From c36d42c5ce1d4831709e802b5190bd93064c86f4 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Thu, 5 Oct 2023 14:21:53 +0000
Subject: [PATCH 01/69] try to implement get_output_shape in attention.cc

---
 lib/op-attrs/include/op-attrs/ops/attention.h | 33 ++++++-----
 lib/op-attrs/include/op-attrs/ops/concat.h    |  2 +
 lib/op-attrs/src/attention.cc                 | 58 ++++++++++++-------
 lib/op-attrs/src/get_output_shapes.cc         |  2 +
 4 files changed, 60 insertions(+), 35 deletions(-)
diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
index ec3e592607..79469206a3 100644
--- a/lib/op-attrs/include/op-attrs/ops/attention.h
+++ b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -7,20 +7,6 @@
 
 namespace FlexFlow {
 
-struct MultiHeadAttentionAttrs {
-  req<int> embed_dim, num_heads, kdim, vdim;
-  req<float> dropout;
-  req<bool> bias, add_bias_kv, add_zero_attn;
-};
-FF_VISITABLE_STRUCT(MultiHeadAttentionAttrs,
-                    embed_dim,
-                    num_heads,
-                    kdim,
-                    vdim,
-                    dropout,
-                    bias,
-                    add_bias_kv,
-                    add_zero_attn);
 
 template <typename TensorType>
 struct MultiHeadAttentionInputs
@@ -43,6 +29,24 @@ struct MultiHeadAttentionInputs
   TensorType value;
 };
 
+struct MultiHeadAttentionAttrs { // attribute bundle for the multi-head attention operator
+  req<int> embed_dim, num_heads, kdim, vdim;
+  req<float> dropout;
+  req<bool> bias, add_bias_kv, add_zero_attn;
+  bool is_valid(MultiHeadAttentionInputs<ParallelTensorShape> const &) const; // names MultiHeadAttentionInputs, so this struct must come after that template
+};
+
+FF_VISITABLE_STRUCT(MultiHeadAttentionAttrs,
+                    embed_dim,
+                    num_heads,
+                    kdim,
+                    vdim,
+                    dropout,
+                    bias,
+                    add_bias_kv,
+                    add_zero_attn);
+CHECK_VALID_OP_ATTR(MultiHeadAttentionAttrs);
+
 int get_qProjSize(MultiHeadAttentionAttrs const &);
 int get_vProjSize(MultiHeadAttentionAttrs const &);
 int get_kProjSize(MultiHeadAttentionAttrs const &);
@@ -67,6 +71,7 @@ ParallelTensorShape
 ParallelTensorShape
     get_output_shape(MultiHeadAttentionAttrs const &,
                      MultiHeadAttentionInputs<ParallelTensorShape> const &);
+
 TensorShape get_output_shape(MultiHeadAttentionAttrs const &,
                              MultiHeadAttentionInputs<TensorShape> const &);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/concat.h b/lib/op-attrs/include/op-attrs/ops/concat.h
index b9bd14a231..776963a2d2 100644
--- a/lib/op-attrs/include/op-attrs/ops/concat.h
+++ b/lib/op-attrs/include/op-attrs/ops/concat.h
@@ -10,7 +10,9 @@ namespace FlexFlow {
 
 struct ConcatAttrs {
   ff_dim_t axis;
+  bool is_valid(std::vector<ParallelTensorShape> const & input) const;
 };
+
 FF_VISITABLE_STRUCT(ConcatAttrs, axis);
 CHECK_VALID_OP_ATTR(ConcatAttrs);
 
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index e9ae6ec803..7ccefb49ba 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/attention.h"
+#include "op-attrs/parallel_tensor_shape.h"
 
 namespace FlexFlow {
 
@@ -10,6 +11,14 @@ namespace FlexFlow {
 /*   return is_valid; */
 /* } */
 
+bool MultiHeadAttentionAttrs::is_valid(MultiHeadAttentionInputs<ParallelTensorShape> const & input) const {
+  bool valid = true;
+  valid &= is_valid(input.key);
+  valid &= is_valid(input.query);
+  valid &= is_valid(input.value);
+  return valid;
+}
+
 int get_qProjSize(MultiHeadAttentionAttrs const &attrs) {
   return attrs.kdim;
 }
@@ -52,34 +61,41 @@ TensorShape
 
   return {dims, DataType::FLOAT};
 }
+//these two functions are not defined in the attention.h
+// ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
+//                                      ParallelTensorShape const &query_shape,
+//                                      ParallelTensorShape const &key_shape,
+//                                      ParallelTensorShape const &value_shape) {
+//   /* ParallelDim replica_dim = query_shape.at(ff_dim_t(query_shape.num_dims() -
+//    * 2)); */
+//   /* replica_dim.size = replica_dim.degree; */
 
-ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
-                                     ParallelTensorShape const &query_shape,
-                                     ParallelTensorShape const &key_shape,
-                                     ParallelTensorShape const &value_shape) {
-  /* ParallelDim replica_dim = query_shape.at(ff_dim_t(query_shape.num_dims() -
-   * 2)); */
-  /* replica_dim.size = replica_dim.degree; */
+//   /* ParallelDim */
 
-  /* ParallelDim */
+//   ParallelTensorShape output_shape = query_shape;
+//   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size = attrs.embed_dim;
+//   return output_shape;
+// }
 
-  ParallelTensorShape output_shape = query_shape;
+// TensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
+//                              TensorShape const &query_shape,
+//                              TensorShape const &key_shape,
+//                              TensorShape const &value_shape) {
+//   ParallelTensorShape parallel_shape =
+//       get_output_shape(attrs,
+//                        static_cast<ParallelTensorShape>(query_shape),
+//                        static_cast<ParallelTensorShape>(key_shape),
+//                        static_cast<ParallelTensorShape>(value_shape));
+//   return get_tensor_shape_unsafe(parallel_shape);
+// }
+
+ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const & attrs, // Attention output shape: the query shape with its last dim resized to embed_dim
+                                     MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
+  ParallelTensorShape output_shape = inputs.query; // key and value shapes are not consulted
   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size = attrs.embed_dim;
   return output_shape;
 }
 
-TensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
-                             TensorShape const &query_shape,
-                             TensorShape const &key_shape,
-                             TensorShape const &value_shape) {
-  ParallelTensorShape parallel_shape =
-      get_output_shape(attrs,
-                       static_cast<ParallelTensorShape>(query_shape),
-                       static_cast<ParallelTensorShape>(key_shape),
-                       static_cast<ParallelTensorShape>(value_shape));
-  return get_tensor_shape_unsafe(parallel_shape);
-}
-
 } // namespace FlexFlow
 
 // Tensor FFModel::multihead_attention(const Tensor query,
diff --git a/lib/op-attrs/src/get_output_shapes.cc b/lib/op-attrs/src/get_output_shapes.cc
index f44a677873..b41912d577 100644
--- a/lib/op-attrs/src/get_output_shapes.cc
+++ b/lib/op-attrs/src/get_output_shapes.cc
@@ -20,4 +20,6 @@ TensorShape get_output_shape(AggregateAttrs const &attrs,
                        as_parallel(exp_preds)));
 }
 
+
+
 } // namespace FlexFlow

From 959c50f04f98057d4d628141d25cdb2569a36cb2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 7 Oct 2023 20:44:12 +0000
Subject: [PATCH 02/69] leave the implementation

---
 .../include/op-attrs/ops/batch_matmul.h         |  6 ++++++
 .../include/op-attrs/parallel_tensor_shape.h    |  2 ++
 lib/op-attrs/src/attention.cc                   |  6 +++---
 lib/op-attrs/src/batch_matmul.cc                | 17 +++++++++++++++++
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index c74824570c..dbcc292fd6 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -9,11 +9,17 @@ namespace FlexFlow {
 
 struct BatchMatmulAttrs {
   req<int> a_seq_length_dim, b_seq_length_dim;
+  bool is_valid(ParallelTensorShape const &,
+                                     ParallelTensorShape const &);
 };
 FF_VISITABLE_STRUCT(BatchMatmulAttrs, a_seq_length_dim, b_seq_length_dim);
 
 CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
 
+ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
+                                     ParallelTensorShape const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index fd560352bb..ca980966e8 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -32,6 +32,8 @@ struct ParallelTensorShape : public use_visitable_cmp<ParallelTensorShape> {
   ParallelDim const &operator[](ff_dim_t const &) const;
   ParallelDim &operator[](ff_dim_t const &);
 
+  bool is_valid() const;
+
 public:
   ParallelTensorDims dims;
   DataType data_type;
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index 7ccefb49ba..c8148cd45d 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -13,9 +13,9 @@ namespace FlexFlow {
 
 bool MultiHeadAttentionAttrs::is_valid(MultiHeadAttentionInputs<ParallelTensorShape> const & input) const {
   bool valid = true;
-  valid &= is_valid(input.key);
-  valid &= is_valid(input.query);
-  valid &= is_valid(input.value);
+  valid &= input.key.is_valid();
+  valid &= input.query.is_valid();
+  valid &= input.value.is_valid();
   return valid;
 }
 
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 1cc8c5cfda..31ca31d81b 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -2,6 +2,23 @@
 
 namespace FlexFlow {
 
+ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const & lhs, ParallelTensorShape const & rhs) {
+    if (!lhs.is_valid() || !rhs.is_valid()) {
+          return false;
+    }
+    if (lhs.num_dims() != rhs.num_dims()) {
+          return false;
+    }
+    return true;
+}
+
+ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
+                                     ParallelTensorShape const &,
+                                     ParallelTensorShape const &) {
+
+}
+
+
 /* bool BatchMatmulAttrs::is_valid( */
 /*     ParallelTensorShape const &lhs, ParallelTensorShape const &rhs) const {
  */

From be7e04f68b2d7f35ea9fe25a41011b6cc12939f7 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:10:31 +0000
Subject: [PATCH 03/69] implement the get output_shape for batch_matmul

---
 .../include/op-attrs/ops/batch_matmul.h       |  1 -
 lib/op-attrs/src/batch_matmul.cc              | 21 +++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index dbcc292fd6..6473f923a2 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -19,7 +19,6 @@ CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
                                      ParallelTensorShape const &,
                                      ParallelTensorShape const &);
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 31ca31d81b..537c08f0aa 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -1,8 +1,11 @@
 #include "op-attrs/ops/batch_matmul.h"
+#include "op-attrs/ff_dim.h"
+#include "op-attrs/parallel_tensor_shape.h"
 
 namespace FlexFlow {
 
- bool BatchMatmulAttrs::is_valid(ParallelTensorShape const & lhs, ParallelTensorShape const & rhs) {
+//maybe we should add more check here
+bool BatchMatmulAttrs::is_valid(ParallelTensorShape const & lhs, ParallelTensorShape const & rhs) {
     if (!lhs.is_valid() || !rhs.is_valid()) {
           return false;
     }
@@ -12,11 +15,17 @@ namespace FlexFlow {
     return true;
 }
 
-ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
-                                     ParallelTensorShape const &,
-                                     ParallelTensorShape const &) {
-
-}
+//how to get the batch size? and lhs: [b, s1, k], rhs: [b, k, s1]
+ParallelTensorShape get_output_shape(BatchMatmulAttrs const & attrs, // Builds the batch-matmul output shape from a copy of lhs
+                                     ParallelTensorShape const & lhs,
+                                     ParallelTensorShape const & rhs) { // rhs is not read in the body
+  ParallelTensorShape   output_shape = lhs;
+  output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size; // no-op: output_shape is already a copy of lhs
+  output_shape.at(ff_dim_t(1)).size = attrs.a_seq_length_dim; // NOTE(review): the attr value itself is stored as the size — confirm it is a length and not a dim index
+  output_shape.at(ff_dim_t(2)).size = attrs.b_seq_length_dim; // NOTE(review): same question as for a_seq_length_dim
+  //TODO: Do we need to set the ParallelDim for output_shape
+  return output_shape;  
+}     
 
 
 /* bool BatchMatmulAttrs::is_valid( */

From f64bd64b189966f649f143a01b1f1437e1ab204d Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:13:18 +0000
Subject: [PATCH 04/69] implement the batch_norm

---
 lib/op-attrs/include/op-attrs/ops/batch_norm.h |  3 ++-
 lib/op-attrs/src/batch_matmul.cc               |  1 +
 lib/op-attrs/src/batch_norm.cc                 | 15 ++++++++++++++-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_norm.h b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
index 4ec823d4ae..65ab18c33c 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
@@ -9,10 +9,11 @@ namespace FlexFlow {
 
 struct BatchNormAttrs {
   req<bool> relu;
+  bool is_valid(ParallelTensorShape const &);
 };
 FF_VISITABLE_STRUCT(BatchNormAttrs, relu);
 
-ParallelTensorShape get_output_shape(BatchNormAttrs const &);
+ParallelTensorShape get_output_shape(BatchNormAttrs const &, ParallelTensorShape const &);
 
 CHECK_VALID_OP_ATTR(BatchNormAttrs);
 
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 537c08f0aa..170ee655d2 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -12,6 +12,7 @@ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const & lhs, ParallelTensorS
     if (lhs.num_dims() != rhs.num_dims()) {
           return false;
     }
+    
     return true;
 }
 
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 4e352d5f1c..a1123667d2 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -1,3 +1,16 @@
 #include "op-attrs/ops/batch_norm.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+ 
+ bool BatchNormAttrs::is_valid(ParallelTensorShape const & input) { // Valid iff the input shape itself is valid
+     if(!input.is_valid()) {
+         return false;
+     }
+    return true; // no batch-norm-specific constraints are checked
+ }
+
+ParallelTensorShape get_output_shape(BatchNormAttrs const & attrs, ParallelTensorShape const & input) { // Output shape equals the input shape; attrs is not read
+    return input; 
+}
+
+} // namespace FlexFlow

From 30d42558e29fe74c40fcdde6c4250fe5ee2e2230 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:15:30 +0000
Subject: [PATCH 05/69] implement the cast

---
 lib/op-attrs/include/op-attrs/ops/cast.h |  4 ++++
 lib/op-attrs/src/cast.cc                 | 18 +++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/cast.h b/lib/op-attrs/include/op-attrs/ops/cast.h
index 63563f8df8..39d6fe1cc1 100644
--- a/lib/op-attrs/include/op-attrs/ops/cast.h
+++ b/lib/op-attrs/include/op-attrs/ops/cast.h
@@ -10,9 +10,13 @@ namespace FlexFlow {
 
 struct CastAttrs {
   req<DataType> dtype;
+  bool is_valid(ParallelTensorShape const &input) const;
 };
 FF_VISITABLE_STRUCT(CastAttrs, dtype);
 
+ParallelTensorShape get_output_shape(CastAttrs const &,
+                                     ParallelTensorShape const &);
+
 CHECK_VALID_OP_ATTR(CastAttrs);
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index e4ab178a7e..e7dab4689f 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -2,10 +2,18 @@
 
 namespace FlexFlow {
 
-/* bool CastAttrs::is_valid(ParallelTensorShape const &input) const { */
-/*   bool valid = input.is_valid(); */
-/*   valid &= (input.at(input.num_dims() - 1).degree == 1); */
-/*   return valid; */
-/* } */
+bool CastAttrs::is_valid(ParallelTensorShape const &input) const { // Valid iff the input shape itself is valid
+    if (!input.is_valid()) {
+        return false;
+    }
+    return true; // dtype is not examined here
+}
+
+ParallelTensorShape get_output_shape(CastAttrs const &attrs, // Cast output shape: same dims as the input, new element type
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  output.data_type = attrs.dtype; // the only field that differs from the input
+  return output;
+}
 
 } // namespace FlexFlow

From 1e5d742c1ef36bdf549272f88352eca4e1b9b325 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:18:24 +0000
Subject: [PATCH 06/69] implement the combine

---
 lib/op-attrs/include/op-attrs/ops/combine.h |  4 ++++
 lib/op-attrs/src/combine.cc                 | 22 +++++++++++----------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/combine.h b/lib/op-attrs/include/op-attrs/ops/combine.h
index deaba9e093..ffc04d4656 100644
--- a/lib/op-attrs/include/op-attrs/ops/combine.h
+++ b/lib/op-attrs/include/op-attrs/ops/combine.h
@@ -11,10 +11,14 @@ namespace FlexFlow {
 struct CombineAttrs {
   ff_dim_t combine_dim;
   req<int> combine_degree;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(CombineAttrs, combine_dim, combine_degree);
 CHECK_VALID_OP_ATTR(CombineAttrs);
 
+ParallelTensorShape get_output_shape(CombineAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index cdca524538..8cfe6dfb8c 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -3,16 +3,18 @@
 
 namespace FlexFlow {
 
-/* bool CombineAttrs::is_valid(ParallelTensorShape const &input) const { */
-/*   return input.at(this->combine_legion_dim).degree % this->combine_degree ==
- * 0; */
-/* } */
+bool CombineAttrs::is_valid(ParallelTensorShape const &input) const { // Valid iff the input shape itself is valid
+    if (!input.is_valid()) {
+        return false;
+    }
+    return true; // NOTE(review): divisibility of the combined dim's degree by combine_degree is not checked here — confirm whether it should be
+}
 
-/* ParallelTensorShape CombineAttrs::output_shape(ParallelTensorShape const
- * &input_shape) const { */
-/*   ParallelTensorShape output = input_shape; */
-/*   output.at(this->combine_legion_dim).degree /= this->combine_degree; */
-/*   return output; */
-/* } */
+ParallelTensorShape get_output_shape(CombineAttrs const & attrs, // Combine output shape: the input with combine_dim's degree divided by combine_degree
+                                     ParallelTensorShape const & input) {
+  ParallelTensorShape output = input; // copy the input; only one dim's degree changes below
+  output.at(attrs.combine_dim).degree /= attrs.combine_degree; // integer division of the degree
+  return output;
+}
 
 } // namespace FlexFlow

From 5f4cf5ccaad778ed54e351d32b8bb3781655960e Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:19:18 +0000
Subject: [PATCH 07/69] add concat

---
 lib/op-attrs/include/op-attrs/ops/concat.h |  3 +++
 lib/op-attrs/src/concat.cc                 | 25 +++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/concat.h b/lib/op-attrs/include/op-attrs/ops/concat.h
index 776963a2d2..afe3e0dd8d 100644
--- a/lib/op-attrs/include/op-attrs/ops/concat.h
+++ b/lib/op-attrs/include/op-attrs/ops/concat.h
@@ -13,6 +13,9 @@ struct ConcatAttrs {
   bool is_valid(std::vector<ParallelTensorShape> const & input) const;
 };
 
+ParallelTensorShape get_output_shape(ConcatAttrs const &,
+                                     std::vector<ParallelTensorShape> const &);
+
 FF_VISITABLE_STRUCT(ConcatAttrs, axis);
 CHECK_VALID_OP_ATTR(ConcatAttrs);
 
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 065c58f365..e4b9496e69 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -2,13 +2,22 @@
 
 namespace FlexFlow {
 
-/* bool ConcatAttrs::is_valid( */
-/*     std::vector<ParallelTensorShape> const &input) const { */
-/*   bool valid = true; */
-/*   for (auto p : input) { */
-/*     valid &= p.is_valid(); */
-/*   } */
-/*   return valid; */
-/* } */
+bool ConcatAttrs::is_valid( // Valid iff every input shape is valid; true when there are no inputs
+    std::vector<ParallelTensorShape> const &input) const {
+  bool valid = true;
+  for (auto const &p : input) { // bind by reference so no shape is copied per iteration
+    valid &= p.is_valid();
+  }
+  return valid;
+}
+
+ParallelTensorShape // Concat output shape: inputs[0] with the axis size summed over all inputs
+    get_output_shape(ConcatAttrs const &attrs,
+                     std::vector<ParallelTensorShape> const &inputs) {
+  ParallelTensorShape output = inputs.at(0); // at() throws std::out_of_range on an empty list
+  for (auto it = inputs.begin() + 1; it != inputs.end(); ++it) // skip inputs[0]: its axis size is already in output
+    output.at(attrs.axis).size += it->at(attrs.axis).size;
+  return output;
+}
 
 } // namespace FlexFlow

From ba8386f3d3d1c8c46fc8e1d1b8171702002643f6 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:29:01 +0000
Subject: [PATCH 08/69] try to implement the conv2d

---
 lib/op-attrs/include/op-attrs/ops/conv_2d.h |  4 +++
 lib/op-attrs/src/conv_2d.cc                 | 30 +++++++--------------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
index 3034dc8c62..c8491877a7 100644
--- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
@@ -14,6 +14,7 @@ struct Conv2DAttrs {
       padding_w, groups;
   req<optional<Activation>> activation;
   req<bool> use_bias;
+  bool is_valid(TensorShape const & input) const;
 };
 
 FF_VISITABLE_STRUCT(Conv2DAttrs,
@@ -32,6 +33,9 @@ CHECK_VALID_OP_ATTR(Conv2DAttrs);
 TensorShape get_kernel_shape(Conv2DAttrs const &, TensorShape const &);
 TensorShape get_bias_shape(Conv2DAttrs const &, TensorShape const &);
 
+ParallelTensorShape get_output_shape(Conv2DAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index d000d31feb..9e37031eae 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -81,27 +81,17 @@ std::vector<ParallelDimMappingRecord>
   return mappings;
 }
 
-/* bool Conv2DAttrs::is_valid(ParallelTensorShape const &input_shape) const { */
-/*   bool is_valid = true; */
-/*   is_valid &= input_shape.is_valid(); */
-/*   is_valid &= this->calculate_output_shape(input_shape).is_valid(); */
-/*   is_valid &= this->calculate_kernel_shape(input_shape).is_valid(); */
-/*   if (use_bias) { */
-/*     is_valid &= this->calculate_bias_shape(input_shape).is_valid(); */
-/*   } */
-
-/*   // TODO FIXME: Currently disable parallelizing the height and width
- * dimension */
-/*   if (input_shape.at(0).degree > 1 || input_shape.at(1).degree > 1) { */
-/*     return false; */
-/*   } */
-
-/*   return is_valid; */
+bool Conv2DAttrs::is_valid(TensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
+}
 
-/* } */
+//according to pytorch, the input shape: []
+ParallelTensorShape get_output_shape(Conv2DAttrs const & attrs,
+                                     ParallelTensorShape const & input) {
 
-/* OperatorType Conv2DAttrs::op_type() const { */
-/*   return OP_CONV2D; */
-/* } */
+}
 
 } // namespace FlexFlow

From 5efd4ff6725a7f7a835b14241b32953df7e75530 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:36:51 +0000
Subject: [PATCH 09/69] add conv_2d

---
 lib/op-attrs/src/conv_2d.cc | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 9e37031eae..d5d2ad90c1 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -88,10 +88,20 @@ bool Conv2DAttrs::is_valid(TensorShape const &input) const {
   return true;
 }
 
-//according to pytorch, the input shape: []
+//according to PyTorch Conv2d, the input shape is [b, input_channel, input_h, input_w]
+//kernel shape: [output_channel, input_channel, kernel_h, kernel_w]
+//stride_h/stride_w and padding_h/padding_w are read from attrs
+//output shape: [b, output_channel, output_h, output_w]
+//output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
+//output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
 ParallelTensorShape get_output_shape(Conv2DAttrs const & attrs,
                                      ParallelTensorShape const & input) {
-
+  ParallelTensorShape output = input; // dim 0 is carried over from the input unchanged
+  output.at(ff_dim_t(1)).size = attrs.out_channels; // NOTE(review): assumes dim 1 is the channel dim, per the layout comment above — confirm
+  output.at(ff_dim_t(2)).size = (input.at(ff_dim_t(2)).size +
+                                 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + 1;
+  output.at(ff_dim_t(3)).size = (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /attrs.stride_w +1;
+  return output;
 }
 
 } // namespace FlexFlow

From fbdb407e5665cf0a895f2cc5ddf5892296dff7c1 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:40:59 +0000
Subject: [PATCH 10/69] add dropout

---
 lib/op-attrs/include/op-attrs/ops/dropout.h |  4 ++++
 lib/op-attrs/src/dropout.cc                 | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 lib/op-attrs/src/dropout.cc

diff --git a/lib/op-attrs/include/op-attrs/ops/dropout.h b/lib/op-attrs/include/op-attrs/ops/dropout.h
index 8e0049f526..04f244f27f 100644
--- a/lib/op-attrs/include/op-attrs/ops/dropout.h
+++ b/lib/op-attrs/include/op-attrs/ops/dropout.h
@@ -10,10 +10,14 @@ namespace FlexFlow {
 struct DropoutAttrs {
   req<float> rate;
   req<unsigned long long> seed;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(DropoutAttrs, rate, seed);
 CHECK_VALID_OP_ATTR(DropoutAttrs);
 
+ParallelTensorShape get_output_shape(DropoutAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/dropout.cc b/lib/op-attrs/src/dropout.cc
new file mode 100644
index 0000000000..dc60d8cf94
--- /dev/null
+++ b/lib/op-attrs/src/dropout.cc
@@ -0,0 +1,19 @@
+#include "dropout.h"
+#include "op-attrs/get_output_shapes.h"
+
+namespace FlexFlow {    
+
+bool DropoutAttrs::is_valid(ParallelTensorShape const & input) const { // Valid iff the input shape itself is valid
+    if(!input.is_valid()) {
+        return false;
+    }
+    return true; // rate and seed are not examined here
+}
+
+ParallelTensorShape get_output_shape(DropoutAttrs const &attrs, // Output shape is a copy of the input shape; attrs is not read
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  return output;
+}
+
+} // namespace FlexFlow
\ No newline at end of file

From 32aa332743d9fb58b539d9c4b75adeebcd946e84 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:45:02 +0000
Subject: [PATCH 11/69] add element binary

---
 .../include/op-attrs/ops/element_binary.h     |  6 ++++
 lib/op-attrs/src/element_binary.cc            | 36 ++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/element_binary.h b/lib/op-attrs/include/op-attrs/ops/element_binary.h
index c4a096166d..f455333347 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_binary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_binary.h
@@ -14,6 +14,8 @@ struct ElementBinaryAttrs {
   req<DataType> compute_type;
   req<bool> should_broadcast_lhs;
   req<bool> should_broadcast_rhs;
+  bool is_valid(ParallelTensorShape const & lhs,
+                ParallelTensorShape const & rhs) const;
 };
 FF_VISITABLE_STRUCT(ElementBinaryAttrs,
                     type,
@@ -22,6 +24,10 @@ FF_VISITABLE_STRUCT(ElementBinaryAttrs,
                     should_broadcast_rhs);
 CHECK_VALID_OP_ATTR(ElementBinaryAttrs);
 
+ParallelTensorShape get_output_shape(ElementBinaryAttrs const &,
+                                     ParallelTensorShape const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index b713c6753f..2e0b8f8e34 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -1,3 +1,37 @@
 #include "op-attrs/ops/element_binary.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool ElementBinaryAttrs::is_valid(ParallelTensorShape const & input1, // Valid iff both operand shapes are themselves valid
+                                  ParallelTensorShape const & input2) const {
+    if(!input1.is_valid() || !input2.is_valid()) {
+        return false;
+    }
+    return true; // whether the two shapes can be broadcast together is not checked here
+}
+
+ParallelTensorShape get_output_shape(ElementBinaryAttrs const & atts, // Broadcast-style output shape of a binary elementwise op; atts is not read
+                                     ParallelTensorShape const & lhs,
+                                     ParallelTensorShape const & rhs) {
+  ParallelTensorShape output = lhs.num_dims() >= rhs.num_dims() ? lhs : rhs; // start from the operand with more dims
+  for (int i = 0; i < output.num_dims(); i++) { // NOTE(review): dims are paired from index 0 upward — confirm that is the intended alignment
+    if (i >= lhs.num_dims()) { // dim exists only in rhs
+      output.at(ff_dim_t(i)) = rhs.at(ff_dim_t(i));
+    } else if (i >= rhs.num_dims()) { // dim exists only in lhs
+      output.at(ff_dim_t(i)) = lhs.at(ff_dim_t(i));
+    } else if (lhs.at(ff_dim_t(i)).size == rhs.at(ff_dim_t(i)).size) { // sizes agree
+      output.at(ff_dim_t(i)) = lhs.at(ff_dim_t(i));
+    } else if (lhs.at(ff_dim_t(i)).size == 1) { // lhs has size 1: take the rhs dim
+      output.at(ff_dim_t(i)) = rhs.at(ff_dim_t(i));
+    } else if (rhs.at(ff_dim_t(i)).size == 1) { // rhs has size 1: take the lhs dim
+      output.at(ff_dim_t(i)) = lhs.at(ff_dim_t(i));
+    } else {
+      assert(false && "Operands could not be broadcast together");
+      exit(EXIT_FAILURE); // reached when NDEBUG compiles the assert out; must not exit with a success status
+    }
+  }
+
+  return output;
+}
+
+} // namespace FlexFlow

From 9a85d59a70b2d85647eae540adf96bab6fd81a25 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:46:21 +0000
Subject: [PATCH 12/69] add element unary

---
 .../include/op-attrs/ops/element_unary.h         |  4 ++++
 lib/op-attrs/src/element_unary.cc                | 16 +++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h
index 1b72e83cb5..562c50e4ed 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_unary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h
@@ -18,10 +18,14 @@ CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs);
 
 struct ElementUnaryAttrs {
   req<Op> op;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ElementUnaryAttrs, op);
 CHECK_VALID_OP_ATTR(ElementUnaryAttrs);
 
+ParallelTensorShape get_output_shape(ElementUnaryAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index 481151fafb..b59ba92529 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -1,3 +1,17 @@
 #include "op-attrs/ops/element_unary.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool ElementUnaryAttrs::is_valid(ParallelTensorShape const & input) const { // Valid iff the input shape itself is valid
+    if(!input.is_valid()) {
+        return false;
+    }
+    return true; // op is not examined here
+}
+
+ParallelTensorShape get_output_shape(ElementUnaryAttrs const & atts, // Unary elementwise ops keep the input shape; atts is not read
+                                     ParallelTensorShape const & input) {
+  ParallelTensorShape output = input;
+  return output;
+}
+} // namespace FlexFlow

From 698a72980d923551cb6ed81c82d6831436d56c33 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:53:30 +0000
Subject: [PATCH 13/69] add embedding

---
 lib/op-attrs/include/op-attrs/ops/embedding.h |  4 ++++
 lib/op-attrs/src/embedding.cc                 | 21 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/embedding.h b/lib/op-attrs/include/op-attrs/ops/embedding.h
index 8b00fa22ce..c7af920d5a 100644
--- a/lib/op-attrs/include/op-attrs/ops/embedding.h
+++ b/lib/op-attrs/include/op-attrs/ops/embedding.h
@@ -19,10 +19,14 @@ struct EmbeddingAttrs {
   req<int> num_entries, out_channels;
   req<AggregateOp> aggr;
   req<DataType> data_type;
+  bool is_valid(ParallelTensorShape const & input) const;
 };
 FF_VISITABLE_STRUCT(EmbeddingAttrs, num_entries, out_channels, aggr, data_type);
 CHECK_VALID_OP_ATTR(EmbeddingAttrs);
 
+ParallelTensorShape get_output_shape(EmbeddingAttrs const &,
+                                     ParallelTensorShape const &);
+
 TensorShape get_weights_shape(EmbeddingAttrs const &, TensorShape const &);
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index 02cbfaa031..b782e1282c 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -1,3 +1,22 @@
 #include "op-attrs/ops/embedding.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// Validity check for an embedding input: only the well-formedness of the
+// parallel shape is verified, no embedding-specific rule is applied.
+bool EmbeddingAttrs::is_valid(ParallelTensorShape const &input) const {
+  bool const shape_ok = input.is_valid();
+  return shape_ok;
+}
+
+// Modeled on PyTorch nn.Embedding; weight is (num_entries, out_channels).
+// Input: (batch_size, seq_len); output: (batch_size, seq_len, out_channels).
+// NOTE(review): output starts as a copy of input, so ff_dim_t(2) must already
+// exist in the input shape -- TODO confirm no extra dim has to be appended.
+ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  output.at(ff_dim_t(2)).size = atts.out_channels;
+  return output;
+}
+} // namespace FlexFlow

From 72f43bcec6560e11e78e472711542be246d09a6f Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 8 Oct 2023 21:57:51 +0000
Subject: [PATCH 14/69] add flat

---
 lib/op-attrs/src/flat.cc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/op-attrs/src/flat.cc b/lib/op-attrs/src/flat.cc
index 75d31beae4..65da5be1f0 100644
--- a/lib/op-attrs/src/flat.cc
+++ b/lib/op-attrs/src/flat.cc
@@ -14,6 +14,22 @@ namespace Output {
 constexpr int NUMDIM = 3, CHANNEL = 0, SAMPLE = 1, REPLICA = 2;
 }
 
+// Author's analogy: flat is like PyTorch's view, e.g.
+// torch.randn(2, 3, 4).view(-1) has shape (24). The result reuses the input's
+// dims and data type; only the Output::CHANNEL entry is rewritten, to the
+// product of the input channel/height/width sizes and the channel's degree.
+ParallelTensorShape get_output_shape(FlatAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  auto const flattened = input.at(ff_dim_t(Input::CHANNEL)).size *
+                         input.at(ff_dim_t(Input::HEIGHT)).size *
+                         input.at(ff_dim_t(Input::WIDTH)).size;
+  auto const channel_degree = input.at(ff_dim_t(Input::CHANNEL)).degree;
+  ParallelTensorShape result(input.dims, input.data_type);
+  result.at(ff_dim_t(Output::CHANNEL)).size = flattened;
+  result.at(ff_dim_t(Output::CHANNEL)).degree = channel_degree;
+  return result;
+}
+
 /* bool FlatAttrs::is_valid(ParallelTensorShape const &input) const { */
 /*   ParallelTensorShape output_shape = this->calculate_output_shape(input); */
 

From e0f05be4d92ca7f5839e516acef1cefc9f2bc859 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 13:51:34 +0000
Subject: [PATCH 15/69] leave the get_output_shape for gather

---
 lib/op-attrs/include/op-attrs/ops/gather.h |  5 +++++
 lib/op-attrs/src/gather.cc                 | 20 ++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h
index ca2406ef75..55b438cc15 100644
--- a/lib/op-attrs/include/op-attrs/ops/gather.h
+++ b/lib/op-attrs/include/op-attrs/ops/gather.h
@@ -10,10 +10,15 @@ namespace FlexFlow {
 
 struct GatherAttrs {
   ff_dim_t dim;
+  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(GatherAttrs, dim);
 CHECK_VALID_OP_ATTR(GatherAttrs);
 
+std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const &,
+                                                   ParallelTensorShape const &,
+                                                   ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index 4f2c13c794..66f4163a6d 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -2,6 +2,26 @@
 
 namespace FlexFlow {
 
+bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
+                           ParallelTensorShape const &rhs) const {
+  if (lhs.dims.num_dims() != rhs.dims.num_dims()) { // ranks must match
+    return false;
+  }
+  for (int i = 0; i < lhs.dims.num_dims(); i++) { // i is a dim position
+    ff_dim_t const d = ff_dim_t(i);
+    if (d != this->dim && lhs.at(d).size < rhs.at(d).size) {
+      return false; // outside the gather dim, rhs may not exceed lhs
+    }
+  }
+  return true;
+}
+
+std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const & attrs,
+                                                   ParallelTensorShape const & lhs,
+                                                   ParallelTensorShape const & rhs ) {
+
+}
+
 /* bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
  * ParallelTensorShape const &rhs) const { */
 /*   if (lhs.num_dims() != rhs.num_dims()) { */

From 1d18f35c6066ccafc3407342528ff5cb9864d8ae Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:03:43 +0000
Subject: [PATCH 16/69] skip groupby

---
 lib/op-attrs/include/op-attrs/ops/gather.h  |  4 ++--
 lib/op-attrs/include/op-attrs/ops/groupby.h |  5 +++++
 lib/op-attrs/src/gather.cc                  |  4 +++-
 lib/op-attrs/src/groupby.cc                 | 19 ++++++++++++++++++-
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h
index 55b438cc15..44ec8b1fd7 100644
--- a/lib/op-attrs/include/op-attrs/ops/gather.h
+++ b/lib/op-attrs/include/op-attrs/ops/gather.h
@@ -15,10 +15,10 @@ struct GatherAttrs {
 FF_VISITABLE_STRUCT(GatherAttrs, dim);
 CHECK_VALID_OP_ATTR(GatherAttrs);
 
+
 std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const &,
-                                                   ParallelTensorShape const &,
+                                                   ParallelTensorShape const & ,
                                                    ParallelTensorShape const &);
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/groupby.h b/lib/op-attrs/include/op-attrs/ops/groupby.h
index 174c40242e..702cbd2a1c 100644
--- a/lib/op-attrs/include/op-attrs/ops/groupby.h
+++ b/lib/op-attrs/include/op-attrs/ops/groupby.h
@@ -10,10 +10,15 @@ namespace FlexFlow {
 struct Group_byAttrs {
   req<int> n;
   req<float> alpha;
+  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(Group_byAttrs, n, alpha);
 CHECK_VALID_OP_ATTR(Group_byAttrs);
 
+ParallelTensorShape get_output_shape(Group_byAttrs const &,
+                                     ParallelTensorShape const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index 66f4163a6d..d514b439d4 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/gather.h"
+#include "utils/exception.decl.h"
 
 namespace FlexFlow {
 
@@ -16,10 +17,11 @@ bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
   return true;
 }
 
+// TODO: why does this return a vector instead of a single shape?
 std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const & attrs,
                                                    ParallelTensorShape const & lhs,
                                                    ParallelTensorShape const & rhs ) {
-
+  NOT_IMPLEMENTED();
 }
 
 /* bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index 96c9db2838..efe22e2a25 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -1,3 +1,20 @@
 #include "op-attrs/ops/groupby.h"
+#include "utils/exception.decl.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// Invalid inputs are rejected; the group-by specific rules are still missing.
+bool Group_byAttrs::is_valid(ParallelTensorShape const &lhs,
+                             ParallelTensorShape const &rhs) const {
+  bool const both_ok = lhs.is_valid() && rhs.is_valid();
+  if (!both_ok) { return false; }
+  NOT_IMPLEMENTED();
+}
+
+ParallelTensorShape get_output_shape(Group_byAttrs const & attrs,
+                                     ParallelTensorShape const & lhs,
+                                     ParallelTensorShape const & rhs) {
+    NOT_IMPLEMENTED(); // output shape inference for group-by is not written yet
+}
+
+} // namespace FlexFlow

From c81d5f81179f5b1e0180ff74b2a55c2b56491bf6 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:10:52 +0000
Subject: [PATCH 17/69] add layer norm

---
 lib/op-attrs/include/op-attrs/ops/layer_norm.h |  4 ++++
 lib/op-attrs/src/layer_norm.cc                 | 18 +++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm.h b/lib/op-attrs/include/op-attrs/ops/layer_norm.h
index dab055b2c9..15b6729262 100644
--- a/lib/op-attrs/include/op-attrs/ops/layer_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/layer_norm.h
@@ -12,10 +12,14 @@ struct LayerNormAttrs {
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes;
   req<bool> elementwise_affine;
   req<float> eps;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(LayerNormAttrs, axes, elementwise_affine, eps);
 CHECK_VALID_OP_ATTR(LayerNormAttrs);
 
+ParallelTensorShape get_output_shape(LayerNormAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index ab88de3622..8a660f733c 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -1,3 +1,19 @@
 #include "op-attrs/ops/layer_norm.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool LayerNormAttrs::is_valid(ParallelTensorShape const & input) const { // false if the input shape is itself invalid
+    if(!input.is_valid()) {
+        return false;
+    }
+    return true;
+}
+
+// Layer norm returns the input shape unchanged.
+// TODO: the degree of the parallel dims may also need to be set here.
+ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  return input;
+}
+
+} // namespace FlexFlow

From d721da71c8463394939e7a1829fb26a1e7dbc066 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:20:00 +0000
Subject: [PATCH 18/69] add linear

---
 lib/op-attrs/include/op-attrs/ops/linear.h |  4 ++++
 lib/op-attrs/src/linear.cc                 | 23 +++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h
index 3be8be2040..54a3864e8d 100644
--- a/lib/op-attrs/include/op-attrs/ops/linear.h
+++ b/lib/op-attrs/include/op-attrs/ops/linear.h
@@ -29,11 +29,15 @@ struct LinearAttrs {
   req<DataType> data_type;
   req<Activation> activation;
   req<optional<RegularizerAttrs>> regularizer;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(
     LinearAttrs, out_channels, use_bias, data_type, activation, regularizer);
 CHECK_VALID_OP_ATTR(LinearAttrs);
 
+ParallelTensorShape get_output_shape(LinearAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index 16a94e7f6c..3aa361c342 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -1,3 +1,24 @@
 #include "op-attrs/ops/linear.h"
+#include "op-attrs/ff_dim.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// A linear layer accepts any well-formed parallel input shape; no check on
+// the channel dimension is made here.
+bool LinearAttrs::is_valid(ParallelTensorShape const &input) const {
+  bool const well_formed = input.is_valid();
+  return well_formed;
+}
+
+// PyTorch reference: input {batch_size, input_channels}, weight
+// {input_channels, output_channels}, output {batch_size, output_channels}.
+// LinearAttrs carries no input_channels, so only the output size is set:
+// ff_dim_t(0) of a copy of the input becomes out_channels.
+ParallelTensorShape get_output_shape(LinearAttrs const &atts,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape result = input;
+  result.at(ff_dim_t(0)).size = atts.out_channels;
+  return result;
+}
+
+} // namespace FlexFlow

From 23a266e019e5e73c4fa26e119be28f0c5cd2aee8 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:30:31 +0000
Subject: [PATCH 19/69] add pool2d

---
 lib/op-attrs/include/op-attrs/ops/pool_2d.h |  4 +++
 lib/op-attrs/src/pool_2d.cc                 | 31 +++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
index efe29b3b2e..b688be85f5 100644
--- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
@@ -17,6 +17,7 @@ struct Pool2DAttrs {
   req<int> kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w;
   req<PoolOp> pool_type;
   req<Activation> activation;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(Pool2DAttrs,
                     kernel_h,
@@ -29,6 +30,9 @@ FF_VISITABLE_STRUCT(Pool2DAttrs,
                     activation);
 CHECK_VALID_OP_ATTR(Pool2DAttrs);
 
+ParallelTensorShape get_output_shape(Pool2DAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 namespace fmt {
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index 0867aeb344..8587b114a6 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/pool_2d.h"
+#include "op-attrs/ff_dim.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
 
@@ -39,6 +40,36 @@ static ParallelDimMappingSolution
   return solve_parallel_dim_mappings(construct_mappings(input), {input}, 0, 1);
 }
 
+// Pooling accepts any well-formed parallel input shape; kernel, stride and
+// padding are not checked against the input here.
+bool Pool2DAttrs::is_valid(ParallelTensorShape const &input) const {
+  bool const shape_ok = input.is_valid();
+  return shape_ok;
+}
+
+// Output shape of a 2D pooling op. Assumes (per the author's notes) an input
+// of (batch_size, channels, input_height, input_width), so ff_dim_t(2) and
+// ff_dim_t(3) are the spatial dims -- TODO confirm. Max and average pooling
+// share the formula PyTorch documents for both MaxPool2d and AvgPool2d:
+//   output_height = (input_height + 2 * padding_h - kernel_h) / stride_h + 1
+//   output_width = (input_width + 2 * padding_w - kernel_w) / stride_w + 1
+ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  // Any other pool type is treated as a programming error.
+  assert((attrs.pool_type == PoolOp::AVG || attrs.pool_type == PoolOp::MAX) &&
+         "unsupported pool type");
+  ParallelTensorShape output_shape = input;
+  output_shape.at(ff_dim_t(2)).size =
+      (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
+          attrs.stride_h + 1;
+  output_shape.at(ff_dim_t(3)).size =
+      (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
+          attrs.stride_w + 1;
+  return output_shape;
+}
+
+}
+
 /* ParallelTensorShape Pool2DAttrs::calculate_output_shape(ParallelTensorShape
  * const &input) const { */
 /*   return solve_mappings(input).output_shapes.at(0); */

From ee9bbaabfe5c9c78bc7f0b6424960debf91db97c Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:37:15 +0000
Subject: [PATCH 20/69] leave the reduce

---
 lib/op-attrs/include/op-attrs/ops/reduce.h |  4 ++++
 lib/op-attrs/src/reduce.cc                 | 14 +++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/reduce.h b/lib/op-attrs/include/op-attrs/ops/reduce.h
index 193d3b0dc8..c18d4cd888 100644
--- a/lib/op-attrs/include/op-attrs/ops/reduce.h
+++ b/lib/op-attrs/include/op-attrs/ops/reduce.h
@@ -14,10 +14,14 @@ struct ReduceAttrs {
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes;
   req<Op> op_type;
   req<bool> keepdims;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReduceAttrs, axes, op_type, keepdims);
 CHECK_VALID_OP_ATTR(ReduceAttrs);
 
+ParallelTensorShape get_output_shape(ReduceAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 9d1770d5be..f6e4c1c829 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -1,3 +1,15 @@
 #include "op-attrs/ops/reduce.h"
+#include "utils/exception.decl.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool ReduceAttrs::is_valid(ParallelTensorShape const &input) const {
+  NOT_IMPLEMENTED(); // validity rules for reduce are still to be written
+}
+
+ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  NOT_IMPLEMENTED(); // shape inference for reduce is still to be written
+}
+
+} // namespace FlexFlow

From 5e354baf7e90a902ce2158acc7d06c87e6a5ac1c Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 14:42:17 +0000
Subject: [PATCH 21/69] add reduction

---
 lib/op-attrs/include/op-attrs/ops/reduction.h |  4 ++++
 lib/op-attrs/src/reduction.cc                 | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/lib/op-attrs/include/op-attrs/ops/reduction.h b/lib/op-attrs/include/op-attrs/ops/reduction.h
index f848f879fc..a8e7abd318 100644
--- a/lib/op-attrs/include/op-attrs/ops/reduction.h
+++ b/lib/op-attrs/include/op-attrs/ops/reduction.h
@@ -11,10 +11,14 @@ namespace FlexFlow {
 struct ReductionAttrs {
   ff_dim_t reduction_dim;
   req<int> reduction_degree;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReductionAttrs, reduction_dim, reduction_degree);
 CHECK_VALID_OP_ATTR(ReductionAttrs);
 
+ParallelTensorShape get_output_shape(ReductionAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/reduction.cc b/lib/op-attrs/src/reduction.cc
index 22fc9bab6a..9196000a05 100644
--- a/lib/op-attrs/src/reduction.cc
+++ b/lib/op-attrs/src/reduction.cc
@@ -10,4 +10,19 @@ namespace FlexFlow {
 /*   return output; */
 /* } */
 
+// Only the well-formedness of the input shape is checked; reduction_dim and
+// reduction_degree are not validated against it here.
+bool ReductionAttrs::is_valid(ParallelTensorShape const &input) const {
+  bool const shape_ok = input.is_valid();
+  return shape_ok;
+}
+// Divides both the size and the degree of reduction_dim by reduction_degree.
+ParallelTensorShape get_output_shape(ReductionAttrs const &attrs,
+                                     ParallelTensorShape const &input_shape) {
+  ParallelTensorShape reduced(input_shape.dims, input_shape.data_type);
+  reduced.at(attrs.reduction_dim).size /= attrs.reduction_degree;
+  reduced.at(attrs.reduction_dim).degree /= attrs.reduction_degree;
+  return reduced;
+}
+
 } // namespace FlexFlow

From e95f195642d30641716a328140c8759250c00bbc Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 15:04:56 +0000
Subject: [PATCH 22/69] add reshape

---
 .../include/op-attrs/ops/repartition.h        |  4 +++
 lib/op-attrs/include/op-attrs/ops/replicate.h |  4 +++
 lib/op-attrs/include/op-attrs/ops/reshape.h   |  5 +++
 lib/op-attrs/src/repartition.cc               | 18 ++++++++++
 lib/op-attrs/src/replicate.cc                 | 17 +++++++++-
 lib/op-attrs/src/reshape.cc                   | 33 ++++++++++++++++++-
 6 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/repartition.h b/lib/op-attrs/include/op-attrs/ops/repartition.h
index 83c4ae870b..a795017bf4 100644
--- a/lib/op-attrs/include/op-attrs/ops/repartition.h
+++ b/lib/op-attrs/include/op-attrs/ops/repartition.h
@@ -11,10 +11,14 @@ namespace FlexFlow {
 struct RepartitionAttrs {
   ff_dim_t repartition_dim;
   req<int> repartition_degree;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(RepartitionAttrs, repartition_dim, repartition_degree);
 CHECK_VALID_OP_ATTR(RepartitionAttrs);
 
+ParallelTensorShape get_output_shape(RepartitionAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/replicate.h b/lib/op-attrs/include/op-attrs/ops/replicate.h
index 92e64a4120..c2a9b6abf0 100644
--- a/lib/op-attrs/include/op-attrs/ops/replicate.h
+++ b/lib/op-attrs/include/op-attrs/ops/replicate.h
@@ -11,10 +11,14 @@ namespace FlexFlow {
 struct ReplicateAttrs {
   ff_dim_t replicate_dim;
   req<int> replicate_degree;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReplicateAttrs, replicate_dim, replicate_degree);
 CHECK_VALID_OP_ATTR(ReplicateAttrs);
 
+ParallelTensorShape get_output_shape(ReplicateAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/reshape.h b/lib/op-attrs/include/op-attrs/ops/reshape.h
index b118482a2b..cf0eb07a95 100644
--- a/lib/op-attrs/include/op-attrs/ops/reshape.h
+++ b/lib/op-attrs/include/op-attrs/ops/reshape.h
@@ -3,16 +3,21 @@
 
 #include "core.h"
 #include "op-attrs/tensor_shape.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "utils/visitable.h"
 
 namespace FlexFlow {
 
 struct ReshapeAttrs {
   TensorShape shape;
+  bool is_valid(ParallelTensorShape  const &) const;
 };
 FF_VISITABLE_STRUCT(ReshapeAttrs, shape);
 CHECK_VALID_OP_ATTR(ReshapeAttrs);
 
+ParallelTensorShape get_output_shape(ReshapeAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index 672e68b4f6..ad037b7cf6 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/repartition.h"
+#include "op-attrs/parallel_dim.h"
 
 namespace FlexFlow {
 
@@ -8,4 +9,21 @@ namespace FlexFlow {
 /*   return (dim.size % this->repartition_degree * dim.degree == 0); */
 /* } */
 
+// Valid when the input is valid and the dim's size divides evenly by its new
+// total degree (current degree * repartition_degree).
+bool RepartitionAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) { return false; }
+  ParallelDim const dim = input.at(this->repartition_dim);
+  return dim.size % (dim.degree * this->repartition_degree) == 0;
+}
+
+// Repartitioning by n multiplies the degree of repartition_dim by n and keeps
+// every size the same. NOTE(review): the author marked this as possibly wrong.
+ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
+                                     ParallelTensorShape const &input_shape) {
+  ParallelTensorShape repartitioned(input_shape.dims, input_shape.data_type);
+  repartitioned.at(attrs.repartition_dim).degree *= attrs.repartition_degree;
+  return repartitioned;
+}
+
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index 73ad288d8c..26861d3624 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -1,3 +1,18 @@
 #include "op-attrs/ops/replicate.h"
+#include "utils/exception.decl.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool ReplicateAttrs::is_valid(ParallelTensorShape const &input) const {
+  NOT_IMPLEMENTED();
+}
+
+//replicate by n multiplies degree by n and shape by n
+ParallelTensorShape get_output_shape(ReplicateAttrs const & attrs,
+                                     ParallelTensorShape const & input) {
+  NOT_IMPLEMENTED();
+}
+
+
+
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/reshape.cc b/lib/op-attrs/src/reshape.cc
index e8349e1f26..777f2eef1b 100644
--- a/lib/op-attrs/src/reshape.cc
+++ b/lib/op-attrs/src/reshape.cc
@@ -1,3 +1,34 @@
 #include "op-attrs/ops/reshape.h"
+#include "op-attrs/ff_dim.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// Element counts must match; a PyTorch-style -1 wildcard is not supported.
+bool ReshapeAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  std::size_t elems_in = 1;
+  for (int d = 0; d < input.num_dims(); ++d) {
+    elems_in *= input.at(ff_dim_t(d)).size;
+  }
+  std::size_t elems_wanted = 1;
+  for (int d = 0; d < this->shape.dims.num_dims(); ++d) {
+    elems_wanted *= this->shape.at(ff_dim_t(d));
+  }
+  return elems_in == elems_wanted;
+}
+
+// The output takes its dims from attrs.shape and its data type from the input,
+// e.g. input [2,3,4] with attrs.shape [4,6] gives [4,6]. A PyTorch-style -1
+// entry (shape [-1,6]) is not considered yet and may be supported later.
+ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  // Matching element counts are a precondition, enforced through assert.
+  assert(attrs.is_valid(input) && "input is not valid");
+  // Parallel dims are built from the plain target dims.
+  ParallelTensorDims target_dims{attrs.shape.dims};
+  return ParallelTensorShape{target_dims, input.data_type};
+}
+
+} // namespace FlexFlow

From f053a20c902ff8a17300184bbd13b4e7a771671c Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 15:19:42 +0000
Subject: [PATCH 23/69] add reverse draft

---
 lib/op-attrs/include/op-attrs/ops/reverse.h | 5 +++++
 lib/op-attrs/src/reduce.cc                  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/reverse.h b/lib/op-attrs/include/op-attrs/ops/reverse.h
index 6030285f14..a0bd38b9c9 100644
--- a/lib/op-attrs/include/op-attrs/ops/reverse.h
+++ b/lib/op-attrs/include/op-attrs/ops/reverse.h
@@ -4,15 +4,20 @@
 #include "core.h"
 #include "op-attrs/ff_dim.h"
 #include "utils/visitable.h"
+#include "op-attrs/parallel_tensor_shape.h"
 
 namespace FlexFlow {
 
 struct ReverseAttrs {
   ff_dim_t axis;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReverseAttrs, axis);
 CHECK_VALID_OP_ATTR(ReverseAttrs);
 
+ParallelTensorShape get_output_shape(ReverseAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index f6e4c1c829..2c42e5cfad 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -1,5 +1,5 @@
 #include "op-attrs/ops/reduce.h"
-#include "utils/exception.decl.h"
+#include "utils/exceptions.h"
 
 namespace FlexFlow {
 

From 6f6f61e21cab4a2b30617ce0a22a2b4dbaa24e72 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 15:21:44 +0000
Subject: [PATCH 24/69] add layer norm valid check

---
 lib/op-attrs/src/layer_norm.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index 8a660f733c..4e706cbd28 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -6,6 +6,9 @@ bool LayerNormAttrs::is_valid(ParallelTensorShape const & input) const {
     if(!input.is_valid()) {
         return false;
     }
+    if(input.num_dims() < 2) {
+        return false;
+    }
     return true;
 }
 

From d47198e8c527965e33cfe90e95da262b7a16ff2a Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 13:07:33 +0000
Subject: [PATCH 25/69] add replicate

---
 lib/op-attrs/src/replicate.cc | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index 26861d3624..b5e3e81d7e 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -1,16 +1,34 @@
 #include "op-attrs/ops/replicate.h"
+#include "op-attrs/parallel_dim.h"
 #include "utils/exception.decl.h"
 
 namespace FlexFlow {
 
 bool ReplicateAttrs::is_valid(ParallelTensorShape const &input) const {
-  NOT_IMPLEMENTED();
+  // The input shape has to be well formed before anything else is checked.
+  if (!input.is_valid()) {
+    return false;
+  }
+  // The dim must lie below num_dims() and the degree must be positive.
+  bool const dim_out_of_range = this->replicate_dim >= input.num_dims();
+  bool const bad_degree = this->replicate_degree <= 0;
+  return !(dim_out_of_range || bad_degree);
 }
 
 //replicate by n multiplies degree by n and shape by n
+// In terms of size this resembles PyTorch's repeat along one dim:
+//   torch.tensor([1, 2, 3]).repeat(3)    -> torch.Size([9])
+//   torch.randn(2, 3, 4).repeat(3, 1, 1) -> torch.Size([6, 3, 4])
+// The degree of replicate_dim grows by the same factor as its size.
+// Precondition: attrs.is_valid(input), asserted below.
 ParallelTensorShape get_output_shape(ReplicateAttrs const & attrs,
                                      ParallelTensorShape const & input) {
-  NOT_IMPLEMENTED();
+  assert(attrs.is_valid(input));
+  ParallelTensorShape output = input;
+  auto &replicated = output.at(attrs.replicate_dim);
+  replicated.size *= attrs.replicate_degree;
+  replicated.degree *= attrs.replicate_degree;
+  return output;
 }
 
 

From ea0297e5856d5a0c043a1ad4e6eaed3ed889a3f3 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 13:19:34 +0000
Subject: [PATCH 26/69] add softmax

---
 lib/op-attrs/include/op-attrs/ops/softmax.h |  4 ++++
 lib/op-attrs/src/softmax.cc                 | 21 ++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/softmax.h b/lib/op-attrs/include/op-attrs/ops/softmax.h
index 9a776737f5..8e7a00e661 100644
--- a/lib/op-attrs/include/op-attrs/ops/softmax.h
+++ b/lib/op-attrs/include/op-attrs/ops/softmax.h
@@ -10,10 +10,14 @@ namespace FlexFlow {
 
 struct SoftmaxAttrs {
   ff_dim_t dim;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(SoftmaxAttrs, dim);
 CHECK_VALID_OP_ATTR(SoftmaxAttrs);
 
+ParallelTensorShape get_output_shape(SoftmaxAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/softmax.cc b/lib/op-attrs/src/softmax.cc
index 9f95da4fb7..f5795cc037 100644
--- a/lib/op-attrs/src/softmax.cc
+++ b/lib/op-attrs/src/softmax.cc
@@ -1,3 +1,22 @@
 #include "op-attrs/ops/softmax.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// Softmax needs a well-formed input shape that has at least two dims; the
+// softmax dim itself is not checked against the shape here.
+bool SoftmaxAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  bool const has_enough_dims = !(input.num_dims() < 2);
+  return has_enough_dims;
+}
+
+// Softmax does not change the shape; the input must pass is_valid().
+ParallelTensorShape get_output_shape(SoftmaxAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  assert(attrs.is_valid(input));
+  return input;
+}
+
+} // namespace FlexFlow

From 125a9ad5b98c07e0421bdc5e32e2942686002758 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 14:08:12 +0000
Subject: [PATCH 27/69] add split

---
 lib/op-attrs/include/op-attrs/ops/split.h |  3 ++
 lib/op-attrs/src/split.cc                 | 34 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/split.h b/lib/op-attrs/include/op-attrs/ops/split.h
index fa66bc46f5..02b5e3b45e 100644
--- a/lib/op-attrs/include/op-attrs/ops/split.h
+++ b/lib/op-attrs/include/op-attrs/ops/split.h
@@ -11,9 +11,12 @@ namespace FlexFlow {
 struct SplitAttrs {
   req<stack_vector<int, MAX_NUM_OUTPUTS>> splits;
   ff_dim_t axis;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(SplitAttrs, splits, axis);
 CHECK_VALID_OP_ATTR(SplitAttrs);
+std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const &,
+                                                   ParallelTensorShape const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/src/split.cc b/lib/op-attrs/src/split.cc
index acda8f3262..6ba0b711d1 100644
--- a/lib/op-attrs/src/split.cc
+++ b/lib/op-attrs/src/split.cc
@@ -1,3 +1,35 @@
 #include "op-attrs/ops/split.h"
+#include "op-attrs/ff_dim.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool SplitAttrs::is_valid(ParallelTensorShape const & input) const {
+    if(!input.is_valid()) {
+        return false;
+    }
+    std::size_t dims_sum = 0;
+
+    for(std::size_t i = 0; i < this->splits.size(); ++i) {
+        dims_sum += splits[i];
+    }
+
+    if(dims_sum != input.at(ff_dim_t(axis)).size) {
+        return false;
+    }
+    return true;
+}
+
+
+std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const & attrs,
+                                                   ParallelTensorShape const & input) {
+
+    assert(attrs.is_valid(input));
+    std::vector<ParallelTensorShape> outputs;
+    for(std::size_t i = 0 ; i < attrs.splits.size(); ++i) {
+        outputs.emplace_back(input);
+        outputs.back().at(ff_dim_t(attrs.axis)).size = attrs.splits[i];
+    }
+    return outputs;
+}
+
+} // namespace FlexFlow

From 61e09c650dd6ed6e114b40e7cda321fc09c58aff Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 14:24:58 +0000
Subject: [PATCH 28/69] add topk

---
 lib/op-attrs/include/op-attrs/ops/split.h |  3 +--
 lib/op-attrs/include/op-attrs/ops/topk.h  |  9 ++++++++-
 lib/op-attrs/src/topk.cc                  | 24 ++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/split.h b/lib/op-attrs/include/op-attrs/ops/split.h
index 02b5e3b45e..94a648382f 100644
--- a/lib/op-attrs/include/op-attrs/ops/split.h
+++ b/lib/op-attrs/include/op-attrs/ops/split.h
@@ -7,7 +7,6 @@
 #include "utils/visitable.h"
 
 namespace FlexFlow {
-
 struct SplitAttrs {
   req<stack_vector<int, MAX_NUM_OUTPUTS>> splits;
   ff_dim_t axis;
@@ -17,7 +16,7 @@ FF_VISITABLE_STRUCT(SplitAttrs, splits, axis);
 CHECK_VALID_OP_ATTR(SplitAttrs);
 std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const &,
                                                    ParallelTensorShape const &);
-
+                                                   
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/topk.h b/lib/op-attrs/include/op-attrs/ops/topk.h
index 413855913c..0db94fcaf4 100644
--- a/lib/op-attrs/include/op-attrs/ops/topk.h
+++ b/lib/op-attrs/include/op-attrs/ops/topk.h
@@ -7,13 +7,20 @@
 
 namespace FlexFlow {
 
+//I think we should add axis 
+//pytorch code: torch.topk(input_tensor, k, largest=True, sorted=True, dim=dim)
 struct TopKAttrs {
   req<int> k;
   req<bool> sorted;
+  req<int> axis;
+  bool is_valid(ParallelTensorShape const &) const;
 };
-FF_VISITABLE_STRUCT(TopKAttrs, k, sorted);
+FF_VISITABLE_STRUCT(TopKAttrs, k, sorted,axis);
 CHECK_VALID_OP_ATTR(TopKAttrs);
 
+ParallelTensorShape get_output_shape(TopKAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/topk.cc b/lib/op-attrs/src/topk.cc
index 9d701e4868..00c2d97902 100644
--- a/lib/op-attrs/src/topk.cc
+++ b/lib/op-attrs/src/topk.cc
@@ -1,3 +1,25 @@
 #include "op-attrs/ops/topk.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool TopKAttrs::is_valid(ParallelTensorShape const & input) const {
+    if(!input.is_valid()) {
+        return false;
+    }
+
+    if(k > input.at(ff_dim_t(axis)).size) {
+        return false;
+    }
+    return true;
+}
+
+
+ParallelTensorShape get_output_shape(TopKAttrs const & attrs,
+                                     ParallelTensorShape const & input) {
+    assert(attrs.is_valid(input));
+    ParallelTensorShape output = input;
+    output.at(ff_dim_t(attrs.axis)).size = attrs.k;
+    return output;
+}
+
+} // namespace FlexFlow

From f3d65246d8094623668f8ba7dd6c50f80f72b3e5 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 14:33:54 +0000
Subject: [PATCH 29/69] add transpose

---
 lib/op-attrs/include/op-attrs/ops/attention.h |  1 -
 .../include/op-attrs/ops/batch_matmul.h       |  3 +-
 .../include/op-attrs/ops/batch_norm.h         |  3 +-
 lib/op-attrs/include/op-attrs/ops/concat.h    |  2 +-
 lib/op-attrs/include/op-attrs/ops/conv_2d.h   |  2 +-
 .../include/op-attrs/ops/element_binary.h     |  4 +-
 lib/op-attrs/include/op-attrs/ops/embedding.h |  2 +-
 lib/op-attrs/include/op-attrs/ops/gather.h    |  3 +-
 lib/op-attrs/include/op-attrs/ops/reshape.h   |  4 +-
 lib/op-attrs/include/op-attrs/ops/reverse.h   |  2 +-
 lib/op-attrs/include/op-attrs/ops/split.h     |  2 +-
 lib/op-attrs/include/op-attrs/ops/topk.h      |  6 +-
 lib/op-attrs/include/op-attrs/ops/transpose.h |  4 ++
 lib/op-attrs/src/attention.cc                 | 30 +++++-----
 lib/op-attrs/src/batch_matmul.cc              | 38 ++++++-------
 lib/op-attrs/src/batch_norm.cc                | 19 ++++---
 lib/op-attrs/src/cast.cc                      |  8 +--
 lib/op-attrs/src/combine.cc                   | 14 ++---
 lib/op-attrs/src/conv_2d.cc                   | 27 +++++----
 lib/op-attrs/src/dropout.cc                   | 14 ++---
 lib/op-attrs/src/element_binary.cc            | 18 +++---
 lib/op-attrs/src/element_unary.cc             | 14 ++---
 lib/op-attrs/src/embedding.cc                 | 24 ++++----
 lib/op-attrs/src/flat.cc                      |  5 +-
 lib/op-attrs/src/gather.cc                    |  9 +--
 lib/op-attrs/src/get_output_shapes.cc         |  2 -
 lib/op-attrs/src/groupby.cc                   | 16 +++---
 lib/op-attrs/src/layer_norm.cc                | 26 ++++-----
 lib/op-attrs/src/linear.cc                    | 28 +++++-----
 lib/op-attrs/src/pool_2d.cc                   | 55 +++++++++++--------
 lib/op-attrs/src/reduce.cc                    | 11 ++--
 lib/op-attrs/src/repartition.cc               |  4 +-
 lib/op-attrs/src/replicate.cc                 | 22 ++++----
 lib/op-attrs/src/reshape.cc                   | 39 ++++++-------
 lib/op-attrs/src/reverse.cc                   | 23 ++++++++
 lib/op-attrs/src/softmax.cc                   | 24 ++++----
 lib/op-attrs/src/split.cc                     | 50 ++++++++---------
 lib/op-attrs/src/topk.cc                      | 29 +++++-----
 lib/op-attrs/src/transpose.cc                 | 40 +++++++++++++-
 39 files changed, 352 insertions(+), 275 deletions(-)
 create mode 100644 lib/op-attrs/src/reverse.cc

diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
index 79469206a3..670e4018cc 100644
--- a/lib/op-attrs/include/op-attrs/ops/attention.h
+++ b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -7,7 +7,6 @@
 
 namespace FlexFlow {
 
-
 template <typename TensorType>
 struct MultiHeadAttentionInputs
     : public use_visitable_cmp<MultiHeadAttentionInputs<TensorType>> {
diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index 6473f923a2..f64b2fd8fb 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -9,8 +9,7 @@ namespace FlexFlow {
 
 struct BatchMatmulAttrs {
   req<int> a_seq_length_dim, b_seq_length_dim;
-  bool is_valid(ParallelTensorShape const &,
-                                     ParallelTensorShape const &);
+  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &);
 };
 FF_VISITABLE_STRUCT(BatchMatmulAttrs, a_seq_length_dim, b_seq_length_dim);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/batch_norm.h b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
index 65ab18c33c..c35d7bcd41 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
@@ -13,7 +13,8 @@ struct BatchNormAttrs {
 };
 FF_VISITABLE_STRUCT(BatchNormAttrs, relu);
 
-ParallelTensorShape get_output_shape(BatchNormAttrs const &, ParallelTensorShape const &);
+ParallelTensorShape get_output_shape(BatchNormAttrs const &,
+                                     ParallelTensorShape const &);
 
 CHECK_VALID_OP_ATTR(BatchNormAttrs);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/concat.h b/lib/op-attrs/include/op-attrs/ops/concat.h
index afe3e0dd8d..84def59066 100644
--- a/lib/op-attrs/include/op-attrs/ops/concat.h
+++ b/lib/op-attrs/include/op-attrs/ops/concat.h
@@ -10,7 +10,7 @@ namespace FlexFlow {
 
 struct ConcatAttrs {
   ff_dim_t axis;
-  bool is_valid(std::vector<ParallelTensorShape> const & input) const;
+  bool is_valid(std::vector<ParallelTensorShape> const &input) const;
 };
 
 ParallelTensorShape get_output_shape(ConcatAttrs const &,
diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
index c8491877a7..31290d153e 100644
--- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
@@ -14,7 +14,7 @@ struct Conv2DAttrs {
       padding_w, groups;
   req<optional<Activation>> activation;
   req<bool> use_bias;
-  bool is_valid(TensorShape const & input) const;
+  bool is_valid(TensorShape const &input) const;
 };
 
 FF_VISITABLE_STRUCT(Conv2DAttrs,
diff --git a/lib/op-attrs/include/op-attrs/ops/element_binary.h b/lib/op-attrs/include/op-attrs/ops/element_binary.h
index f455333347..c068fcc45c 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_binary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_binary.h
@@ -14,8 +14,8 @@ struct ElementBinaryAttrs {
   req<DataType> compute_type;
   req<bool> should_broadcast_lhs;
   req<bool> should_broadcast_rhs;
-  bool is_valid(ParallelTensorShape const & lhs,
-                ParallelTensorShape const & rhs) const;
+  bool is_valid(ParallelTensorShape const &lhs,
+                ParallelTensorShape const &rhs) const;
 };
 FF_VISITABLE_STRUCT(ElementBinaryAttrs,
                     type,
diff --git a/lib/op-attrs/include/op-attrs/ops/embedding.h b/lib/op-attrs/include/op-attrs/ops/embedding.h
index c7af920d5a..506b8a6186 100644
--- a/lib/op-attrs/include/op-attrs/ops/embedding.h
+++ b/lib/op-attrs/include/op-attrs/ops/embedding.h
@@ -19,7 +19,7 @@ struct EmbeddingAttrs {
   req<int> num_entries, out_channels;
   req<AggregateOp> aggr;
   req<DataType> data_type;
-  bool is_valid(ParallelTensorShape const & input) const;
+  bool is_valid(ParallelTensorShape const &input) const;
 };
 FF_VISITABLE_STRUCT(EmbeddingAttrs, num_entries, out_channels, aggr, data_type);
 CHECK_VALID_OP_ATTR(EmbeddingAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h
index 44ec8b1fd7..1789edf649 100644
--- a/lib/op-attrs/include/op-attrs/ops/gather.h
+++ b/lib/op-attrs/include/op-attrs/ops/gather.h
@@ -15,9 +15,8 @@ struct GatherAttrs {
 FF_VISITABLE_STRUCT(GatherAttrs, dim);
 CHECK_VALID_OP_ATTR(GatherAttrs);
 
-
 std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const &,
-                                                   ParallelTensorShape const & ,
+                                                   ParallelTensorShape const &,
                                                    ParallelTensorShape const &);
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/include/op-attrs/ops/reshape.h b/lib/op-attrs/include/op-attrs/ops/reshape.h
index cf0eb07a95..7fbe573c93 100644
--- a/lib/op-attrs/include/op-attrs/ops/reshape.h
+++ b/lib/op-attrs/include/op-attrs/ops/reshape.h
@@ -2,15 +2,15 @@
 #define _FLEXFLOW_RESHAPE_ATTRS_H
 
 #include "core.h"
-#include "op-attrs/tensor_shape.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.h"
 #include "utils/visitable.h"
 
 namespace FlexFlow {
 
 struct ReshapeAttrs {
   TensorShape shape;
-  bool is_valid(ParallelTensorShape  const &) const;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReshapeAttrs, shape);
 CHECK_VALID_OP_ATTR(ReshapeAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/reverse.h b/lib/op-attrs/include/op-attrs/ops/reverse.h
index a0bd38b9c9..0c8657c6ec 100644
--- a/lib/op-attrs/include/op-attrs/ops/reverse.h
+++ b/lib/op-attrs/include/op-attrs/ops/reverse.h
@@ -3,8 +3,8 @@
 
 #include "core.h"
 #include "op-attrs/ff_dim.h"
-#include "utils/visitable.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/visitable.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/ops/split.h b/lib/op-attrs/include/op-attrs/ops/split.h
index 94a648382f..e2abeb2581 100644
--- a/lib/op-attrs/include/op-attrs/ops/split.h
+++ b/lib/op-attrs/include/op-attrs/ops/split.h
@@ -16,7 +16,7 @@ FF_VISITABLE_STRUCT(SplitAttrs, splits, axis);
 CHECK_VALID_OP_ATTR(SplitAttrs);
 std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const &,
                                                    ParallelTensorShape const &);
-                                                   
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/topk.h b/lib/op-attrs/include/op-attrs/ops/topk.h
index 0db94fcaf4..914ac1afc2 100644
--- a/lib/op-attrs/include/op-attrs/ops/topk.h
+++ b/lib/op-attrs/include/op-attrs/ops/topk.h
@@ -7,15 +7,15 @@
 
 namespace FlexFlow {
 
-//I think we should add axis 
-//pytorch code: torch.topk(input_tensor, k, largest=True, sorted=True, dim=dim)
+// I think we should add axis
+// pytorch code: torch.topk(input_tensor, k, largest=True, sorted=True, dim=dim)
 struct TopKAttrs {
   req<int> k;
   req<bool> sorted;
   req<int> axis;
   bool is_valid(ParallelTensorShape const &) const;
 };
-FF_VISITABLE_STRUCT(TopKAttrs, k, sorted,axis);
+FF_VISITABLE_STRUCT(TopKAttrs, k, sorted, axis);
 CHECK_VALID_OP_ATTR(TopKAttrs);
 
 ParallelTensorShape get_output_shape(TopKAttrs const &,
diff --git a/lib/op-attrs/include/op-attrs/ops/transpose.h b/lib/op-attrs/include/op-attrs/ops/transpose.h
index 87db435979..461aa0aacb 100644
--- a/lib/op-attrs/include/op-attrs/ops/transpose.h
+++ b/lib/op-attrs/include/op-attrs/ops/transpose.h
@@ -10,10 +10,14 @@ namespace FlexFlow {
 
 struct TransposeAttrs {
   req<stack_vector<ff_dim_t, MAX_TENSOR_DIM>> perm;
+  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(TransposeAttrs, perm);
 CHECK_VALID_OP_ATTR(TransposeAttrs);
 
+ParallelTensorShape get_output_shape(TransposeAttrs const &,
+                                     ParallelTensorShape const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index c8148cd45d..a6ae56ddfd 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -11,7 +11,8 @@ namespace FlexFlow {
 /*   return is_valid; */
 /* } */
 
-bool MultiHeadAttentionAttrs::is_valid(MultiHeadAttentionInputs<ParallelTensorShape> const & input) const {
+bool MultiHeadAttentionAttrs::is_valid(
+    MultiHeadAttentionInputs<ParallelTensorShape> const &input) const {
   bool valid = true;
   valid &= input.key.is_valid();
   valid &= input.query.is_valid();
@@ -61,20 +62,22 @@ TensorShape
 
   return {dims, DataType::FLOAT};
 }
-//these two functions are not defined in the attention.h
-// ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
-//                                      ParallelTensorShape const &query_shape,
-//                                      ParallelTensorShape const &key_shape,
-//                                      ParallelTensorShape const &value_shape) {
-//   /* ParallelDim replica_dim = query_shape.at(ff_dim_t(query_shape.num_dims() -
-//    * 2)); */
-//   /* replica_dim.size = replica_dim.degree; */
+// these two functions are not defined in the attention.h
+//  ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
+//                                       ParallelTensorShape const &query_shape,
+//                                       ParallelTensorShape const &key_shape,
+//                                       ParallelTensorShape const &value_shape)
+//                                       {
+//    /* ParallelDim replica_dim =
+//    query_shape.at(ff_dim_t(query_shape.num_dims() -
+//     * 2)); */
+//    /* replica_dim.size = replica_dim.degree; */
 
 //   /* ParallelDim */
 
 //   ParallelTensorShape output_shape = query_shape;
-//   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size = attrs.embed_dim;
-//   return output_shape;
+//   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size =
+//   attrs.embed_dim; return output_shape;
 // }
 
 // TensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
@@ -89,8 +92,9 @@ TensorShape
 //   return get_tensor_shape_unsafe(parallel_shape);
 // }
 
-ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const & attrs,
-                                     MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
+ParallelTensorShape get_output_shape(
+    MultiHeadAttentionAttrs const &attrs,
+    MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
   ParallelTensorShape output_shape = inputs.query;
   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size = attrs.embed_dim;
   return output_shape;
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 170ee655d2..f30869e035 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -4,30 +4,30 @@
 
 namespace FlexFlow {
 
-//maybe we should add more check here
-bool BatchMatmulAttrs::is_valid(ParallelTensorShape const & lhs, ParallelTensorShape const & rhs) {
-    if (!lhs.is_valid() || !rhs.is_valid()) {
-          return false;
-    }
-    if (lhs.num_dims() != rhs.num_dims()) {
-          return false;
-    }
-    
-    return true;
+// maybe we should add more check here
+bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
+                                ParallelTensorShape const &rhs) {
+  if (!lhs.is_valid() || !rhs.is_valid()) {
+    return false;
+  }
+  if (lhs.num_dims() != rhs.num_dims()) {
+    return false;
+  }
+
+  return true;
 }
 
-//how to get the batch size? and lhs: [b, s1, k], rhs: [b, k, s1]
-ParallelTensorShape get_output_shape(BatchMatmulAttrs const & attrs,
-                                     ParallelTensorShape const & lhs,
-                                     ParallelTensorShape const & rhs) {
-  ParallelTensorShape   output_shape = lhs;
+// how to get the batch size? and lhs: [b, s1, k], rhs: [b, k, s1]
+ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
+                                     ParallelTensorShape const &lhs,
+                                     ParallelTensorShape const &rhs) {
+  ParallelTensorShape output_shape = lhs;
   output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size;
   output_shape.at(ff_dim_t(1)).size = attrs.a_seq_length_dim;
   output_shape.at(ff_dim_t(2)).size = attrs.b_seq_length_dim;
-  //TODO: Do we need to set the ParallelDim for output_shape
-  return output_shape;  
-}     
-
+  // TODO: Do we need to set the ParallelDim for output_shape
+  return output_shape;
+}
 
 /* bool BatchMatmulAttrs::is_valid( */
 /*     ParallelTensorShape const &lhs, ParallelTensorShape const &rhs) const {
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index a1123667d2..9b15913d1f 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -1,16 +1,17 @@
 #include "op-attrs/ops/batch_norm.h"
 
 namespace FlexFlow {
- 
- bool BatchNormAttrs::is_valid(ParallelTensorShape const & input) {
-     if(!input.is_valid()) {
-         return false;
-     }
-    return true;
- }
 
-ParallelTensorShape get_output_shape(BatchNormAttrs const & attrs, ParallelTensorShape const & input) {
-    return input; 
+bool BatchNormAttrs::is_valid(ParallelTensorShape const &input) {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
+}
+
+ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  return input;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index e7dab4689f..d3a6961a2e 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -3,10 +3,10 @@
 namespace FlexFlow {
 
 bool CastAttrs::is_valid(ParallelTensorShape const &input) const {
-    if (!input.is_valid()) {
-        return false;
-    }
-    return true;
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
 ParallelTensorShape get_output_shape(CastAttrs const &attrs,
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index 8cfe6dfb8c..7814442926 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -4,17 +4,17 @@
 namespace FlexFlow {
 
 bool CombineAttrs::is_valid(ParallelTensorShape const &input) const {
-    if (!input.is_valid()) {
-        return false;
-    }
-    return true;
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-ParallelTensorShape get_output_shape(CombineAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
+ParallelTensorShape get_output_shape(CombineAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
   ParallelTensorShape output = input_shape;
   output.at(attrs.combine_dim).degree /= attrs.combine_degree;
-  return output;                                     
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index d5d2ad90c1..42085ca2b1 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -88,19 +88,24 @@ bool Conv2DAttrs::is_valid(TensorShape const &input) const {
   return true;
 }
 
-//according to pytorch, the input shape: [b, input_channel, input_h, input_w]
-//kernel shape: [output_channel, input_channel, kernel_h, kernel_w]
-//we may have stide_h and padding_h
-//output shape: [b, output_channel, output_h, output_w]
-//output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
-//output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
-ParallelTensorShape get_output_shape(Conv2DAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
+// according to pytorch, the input shape: [b, input_channel, input_h, input_w]
+// kernel shape: [output_channel, input_channel, kernel_h, kernel_w]
+// we may have stide_h and padding_h
+// output shape: [b, output_channel, output_h, output_w]
+// output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
+// output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
+ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
   output.at(ff_dim_t(1)).size = attrs.out_channels;
-  output.at(ff_dim_t(2)).size = (input.at(ff_dim_t(2)).size +
-                                 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + 1;
-  output.at(ff_dim_t(3)).size = (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /attrs.stride_w +1;
+  output.at(ff_dim_t(2)).size =
+      (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
+          attrs.stride_h +
+      1;
+  output.at(ff_dim_t(3)).size =
+      (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
+          attrs.stride_w +
+      1;
   return output;
 }
 
diff --git a/lib/op-attrs/src/dropout.cc b/lib/op-attrs/src/dropout.cc
index dc60d8cf94..bccfdb10a2 100644
--- a/lib/op-attrs/src/dropout.cc
+++ b/lib/op-attrs/src/dropout.cc
@@ -1,13 +1,13 @@
 #include "dropout.h"
 #include "op-attrs/get_output_shapes.h"
 
-namespace FlexFlow {    
+namespace FlexFlow {
 
-bool DropoutAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    return true;
+bool DropoutAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
 ParallelTensorShape get_output_shape(DropoutAttrs const &attrs,
@@ -16,4 +16,4 @@ ParallelTensorShape get_output_shape(DropoutAttrs const &attrs,
   return output;
 }
 
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index 2e0b8f8e34..4b20ee25a9 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -2,17 +2,17 @@
 
 namespace FlexFlow {
 
-bool ElementBinaryAttrs::is_valid(ParallelTensorShape const & input1,
-                                  ParallelTensorShape const & input2) const {
-    if(!input1.is_valid() || !input2.is_valid()) {
-        return false;
-    }
-    return true;
+bool ElementBinaryAttrs::is_valid(ParallelTensorShape const &input1,
+                                  ParallelTensorShape const &input2) const {
+  if (!input1.is_valid() || !input2.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-ParallelTensorShape get_output_shape(ElementBinaryAttrs const & atts,
-                                     ParallelTensorShape const & lhs, 
-                                     ParallelTensorShape const & rhs) {
+ParallelTensorShape get_output_shape(ElementBinaryAttrs const &atts,
+                                     ParallelTensorShape const &lhs,
+                                     ParallelTensorShape const &rhs) {
   ParallelTensorShape output = lhs.num_dims() >= rhs.num_dims() ? lhs : rhs;
   for (int i = 0; i < output.num_dims(); i++) {
     if (i >= lhs.num_dims()) {
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index b59ba92529..1fd11abe05 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -2,15 +2,15 @@
 
 namespace FlexFlow {
 
-bool ElementUnaryAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    return true;
+bool ElementUnaryAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-ParallelTensorShape get_output_shape(ElementUnaryAttrs const & atts,
-                                     ParallelTensorShape const & input) {
+ParallelTensorShape get_output_shape(ElementUnaryAttrs const &atts,
+                                     ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
   return output;
 
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index b782e1282c..dca6e393ef 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -2,21 +2,21 @@
 
 namespace FlexFlow {
 
-bool EmbeddingAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    return true;
+bool EmbeddingAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-//pytorch nn.Embedding
-//Embedding OP: (num_embeddings, embedding_dim) (num_entries, out_channels)
-//Input: (batch_size, seq_len)
-//Output: (batch_size, seq_len, embedding_dim)
-ParallelTensorShape get_output_shape(EmbeddingAttrs const & atts,
-                                     ParallelTensorShape const & input) {
+// pytorch nn.Embedding
+// Embedding OP: (num_embeddings, embedding_dim) (num_entries, out_channels)
+// Input: (batch_size, seq_len)
+// Output: (batch_size, seq_len, embedding_dim)
+ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
+                                     ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
   output.at(ff_dim_t(1)).size = input.at(ff_dim_t(1)).size;
-  output.at(ff_dim_t(2)).size= atts.out_channels;
+  output.at(ff_dim_t(2)).size = atts.out_channels;
   return output;
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/flat.cc b/lib/op-attrs/src/flat.cc
index 65da5be1f0..ae351328b7 100644
--- a/lib/op-attrs/src/flat.cc
+++ b/lib/op-attrs/src/flat.cc
@@ -14,8 +14,9 @@ namespace Output {
 constexpr int NUMDIM = 3, CHANNEL = 0, SAMPLE = 1, REPLICA = 2;
 }
 
-//flat is like the pytorch view 
-//tensor = torch.randn(2, 3, 4)  ,flattened_tensor = tensor.view(-1) #shape: (24) 
+// flat is like the pytorch view
+// tensor = torch.randn(2, 3, 4)  ,flattened_tensor = tensor.view(-1) #shape:
+// (24)
 ParallelTensorShape get_output_shape(FlatAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output_shape(input.dims, input.data_type);
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index d514b439d4..25bfe8e516 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -17,10 +17,11 @@ bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
   return true;
 }
 
-//todo: why return a vector?
-std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const & attrs,
-                                                   ParallelTensorShape const & lhs,
-                                                   ParallelTensorShape const & rhs ) {
+// todo: why return a vector?
+std::vector<ParallelTensorShape>
+    get_output_shapes(GatherAttrs const &attrs,
+                      ParallelTensorShape const &lhs,
+                      ParallelTensorShape const &rhs) {
   NOT_IMPLEMENTED();
 }
 
diff --git a/lib/op-attrs/src/get_output_shapes.cc b/lib/op-attrs/src/get_output_shapes.cc
index b41912d577..f44a677873 100644
--- a/lib/op-attrs/src/get_output_shapes.cc
+++ b/lib/op-attrs/src/get_output_shapes.cc
@@ -20,6 +20,4 @@ TensorShape get_output_shape(AggregateAttrs const &attrs,
                        as_parallel(exp_preds)));
 }
 
-
-
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index efe22e2a25..9315b85c39 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -5,16 +5,16 @@ namespace FlexFlow {
 
 bool Group_byAttrs::is_valid(ParallelTensorShape const &lhs,
                              ParallelTensorShape const &rhs) const {
-    if(!lhs.is_valid() || !rhs.is_valid()) {
-        return false;
-    }
-   NOT_IMPLEMENTED();
+  if (!lhs.is_valid() || !rhs.is_valid()) {
+    return false;
+  }
+  NOT_IMPLEMENTED();
 }
 
-ParallelTensorShape get_output_shape(Group_byAttrs const & attrs,
-                                     ParallelTensorShape const & lhs,
-                                     ParallelTensorShape const & rhs) {
-    NOT_IMPLEMENTED();
+ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
+                                     ParallelTensorShape const &lhs,
+                                     ParallelTensorShape const &rhs) {
+  NOT_IMPLEMENTED();
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index 4e706cbd28..081252847a 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -2,21 +2,21 @@
 
 namespace FlexFlow {
 
-bool LayerNormAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    if(input.num_dims() < 2) {
-        return false;
-    }
-    return true;
+bool LayerNormAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  if (input.num_dims() < 2) {
+    return false;
+  }
+  return true;
 }
 
-//todo: maybe we need to set the degree of parallel_dim
-ParallelTensorShape get_output_shape(LayerNormAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
-    ParallelTensorShape output = input;
-    return output;
+// todo: maybe we need to set the degree of parallel_dim
+ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index 3aa361c342..bae30a8ebd 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -3,22 +3,22 @@
 
 namespace FlexFlow {
 
-bool LinearAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    return true;
+bool LinearAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-//pytorch: input shape:{batch_size, input_channels}
-//pytorch linearattrs: should be {input_channels, output_channels} 
-//pytorch: output shape:{batch_size, output_channels}
-//question: the Linearattrs doesn't have input_channels
-ParallelTensorShape get_output_shape(LinearAttrs const & atts,
-                                     ParallelTensorShape const & input) {
-    ParallelTensorShape out_shape = input;
-    out_shape.at(ff_dim_t(0)).size = atts.out_channels;
-    return out_shape;
+// pytorch: input shape:{batch_size, input_channels}
+// pytorch linearattrs: should be {input_channels, output_channels}
+// pytorch: output shape:{batch_size, output_channels}
+// question: the Linearattrs doesn't have input_channels
+ParallelTensorShape get_output_shape(LinearAttrs const &atts,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape out_shape = input;
+  out_shape.at(ff_dim_t(0)).size = atts.out_channels;
+  return out_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index 8587b114a6..6d58210b6a 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -40,32 +40,39 @@ static ParallelDimMappingSolution
   return solve_parallel_dim_mappings(construct_mappings(input), {input}, 0, 1);
 }
 
-bool Pool2DAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    return true;
+bool Pool2DAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  return true;
 }
 
-//pytorch: we have two type of pool2d, maxpool2d and avgpool2d
-//input shape: (batch_size, channels, input_height, input_width)
-//for avgpool2d, output shape: (batch_size, channels, 1, 1)
-//for maxpool2d, output shape: (batch_size, channels, output_height, output_width)
-//output_height = (input_height + 2 * padding_h - kernel_h) / stride_h + 1
-//output_width = (input_width + 2 * padding_w - kernel_w) / stride_w + 1
-ParallelTensorShape get_output_shape(Pool2DAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
-    ParallelTensorShape output_shape = input;    
-    if(attrs.pool_type == PoolOp::AVG) {
-      output_shape.at(ff_dim_t(2)).size = 1;
-      output_shape.at(ff_dim_t(3)).size = 1;
-    } else if(attrs.pool_type == PoolOp::MAX) {
-      output_shape.at(ff_dim_t(2)).size = (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + 1;
-      output_shape.at(ff_dim_t(3)).size = (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) / attrs.stride_w + 1;
-    } else {
-      assert(false && "unsupported pool type");
-    }
-    return output_shape;                                
+// pytorch: we have two types of pool2d, maxpool2d and avgpool2d
+// input shape: (batch_size, channels, input_height, input_width)
+// for avgpool2d, output shape: (batch_size, channels, 1, 1)
+// for maxpool2d, output shape:
+//   (batch_size, channels, output_height, output_width)
+// output_height = (input_height + 2 * padding_h - kernel_h) / stride_h + 1
+// output_width = (input_width + 2 * padding_w - kernel_w) / stride_w + 1
+ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output_shape = input;
+  if (attrs.pool_type == PoolOp::AVG) {
+    output_shape.at(ff_dim_t(2)).size = 1;
+    output_shape.at(ff_dim_t(3)).size = 1;
+  } else if (attrs.pool_type == PoolOp::MAX) {
+    output_shape.at(ff_dim_t(2)).size =
+        (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
+            attrs.stride_h +
+        1;
+    output_shape.at(ff_dim_t(3)).size =
+        (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
+            attrs.stride_w +
+        1;
+  } else {
+    assert(false && "unsupported pool type");
+  }
+  return output_shape;
 }
 
 }
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 2c42e5cfad..3deb33e680 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -3,13 +3,12 @@
 
 namespace FlexFlow {
 
-bool ReduceAttrs::is_valid(ParallelTensorShape const & input) const {
-    NOT_IMPLEMENTED()
-}
+bool ReduceAttrs::is_valid(ParallelTensorShape const &input) const {
+    NOT_IMPLEMENTED()}
 
-ParallelTensorShape get_output_shape(ReduceAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
-    NOT_IMPLEMENTED()
+ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  NOT_IMPLEMENTED()
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index ad037b7cf6..b5a0280d85 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -17,7 +17,8 @@ bool RepartitionAttrs::is_valid(ParallelTensorShape const &input) const {
   return (dim.size % this->repartition_degree * dim.degree == 0);
 }
 
-//this may be wrong partition by n multiplies degree by n and keeps shape the same
+// this may be wrong: partition by n multiplies degree by n and keeps shape the
+// same
 ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
   ParallelTensorShape output(input_shape.dims, input_shape.data_type);
@@ -25,5 +26,4 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
   return output;
 }
 
-
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index b5e3e81d7e..2086ab41bd 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -5,32 +5,30 @@
 namespace FlexFlow {
 
 bool ReplicateAttrs::is_valid(ParallelTensorShape const &input) const {
-  if(!input.is_valid()) {
+  if (!input.is_valid()) {
     return false;
   }
-  if(this->replicate_dim >= input.num_dims() || this->replicate_degree <= 0) {
+  if (this->replicate_dim >= input.num_dims() || this->replicate_degree <= 0) {
     return false;
   }
 
   return true;
 }
 
-//replicate by n multiplies degree by n and shape by n
-//seems it is like pytorch's repeat
-//original_tensor = torch.tensor([1, 2, 3]) torch.Size([3])
-///replicated_tensor = original_tensor.repeat(3) torch.Size([9])
+// replicate by n multiplies degree by n and shape by n
+// seems it is like pytorch's repeat
+// original_tensor = torch.tensor([1, 2, 3]) torch.Size([3])
+// replicated_tensor = original_tensor.repeat(3) torch.Size([9])
 
-//original_tensor = torch.randn(2, 3, 4) torch.Size([2, 3, 4])
-//repeated_tensor = original_tensor.repeat(3, 1, 1) torch.Size([6, 3, 4])
+// original_tensor = torch.randn(2, 3, 4) torch.Size([2, 3, 4])
+// repeated_tensor = original_tensor.repeat(3, 1, 1) torch.Size([6, 3, 4])
 
-ParallelTensorShape get_output_shape(ReplicateAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
+ParallelTensorShape get_output_shape(ReplicateAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
   assert(attrs.is_valid(input));
   ParallelTensorShape output = input;
   output.at(attrs.replicate_dim).size *= attrs.replicate_degree;
   return output;
 }
 
-
-
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/reshape.cc b/lib/op-attrs/src/reshape.cc
index 777f2eef1b..e100efeadb 100644
--- a/lib/op-attrs/src/reshape.cc
+++ b/lib/op-attrs/src/reshape.cc
@@ -3,32 +3,33 @@
 
 namespace FlexFlow {
 
-//pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and the output is [4, 6]
+// pytorch: the input is [2,3,4] and shape may be [-1,6]; should we support
+// this? In that case the output is [4, 6]
 bool ReshapeAttrs::is_valid(ParallelTensorShape const &input) const {
   if (!input.is_valid()) {
     return false;
   }
-  std::size_t input_volume =1;
-    for (int i = 0; i < input.num_dims(); i++) {
-        input_volume *= input.at(ff_dim_t(i)).size;
-    }
-  std::size_t attrs_volume =1;
-    for (int i = 0; i < this->shape.dims.num_dims(); i++) {
-        attrs_volume *=  this->shape.at(ff_dim_t(i));
-    }
-    return (input_volume == attrs_volume);
+  std::size_t input_volume = 1;
+  for (int i = 0; i < input.num_dims(); i++) {
+    input_volume *= input.at(ff_dim_t(i)).size;
+  }
+  std::size_t attrs_volume = 1;
+  for (int i = 0; i < this->shape.dims.num_dims(); i++) {
+    attrs_volume *= this->shape.at(ff_dim_t(i));
+  }
+  return (input_volume == attrs_volume);
 }
 
-//pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and the output is [4, 6]
-//currently we doesn't consider the case of -1,we can support this later
-//the input:[2,3,4], attrs.shape:[4,6], the output is [4, 6]
-ParallelTensorShape get_output_shape(ReshapeAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
+// pytorch: input [2,3,4] with shape [-1,6] gives output [4, 6]; should we add
+// this? Currently we don't consider the case of -1; we can support it later.
+// Example: the input is [2,3,4], attrs.shape is [4,6], the output is [4, 6]
+ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
 
-    assert(attrs.is_valid(input) && "input is not valid");
-    ParallelTensorDims dims{attrs.shape.dims};
-    ParallelTensorShape output{dims, input.data_type};
-    return output;
+  assert(attrs.is_valid(input) && "input is not valid");
+  ParallelTensorDims dims{attrs.shape.dims};
+  ParallelTensorShape output{dims, input.data_type};
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
new file mode 100644
index 0000000000..0af3806778
--- /dev/null
+++ b/lib/op-attrs/src/reverse.cc
@@ -0,0 +1,23 @@
+#include "op-attrs/ops/reverse.h"
+#include "op-attrs/ff_dim.h"
+
+namespace FlexFlow {
+
+bool ReverseAttrs::is_valid(ParallelTensorShape const & input) const {
+    if(input.is_valid() ==false) {
+        return false;
+    }
+    if(this->axis < 0 || this->axis >= input.num_dims()) {
+        return false;
+    }
+    return true;
+}
+
+ParallelTensorShape get_output_shape(ReverseAttrs const & attrs, 
+                                     ParallelTensorShape const & input) {
+    ParallelTensorShape output = input;
+    return output;
+}
+
+
+};
\ No newline at end of file
diff --git a/lib/op-attrs/src/softmax.cc b/lib/op-attrs/src/softmax.cc
index f5795cc037..91d6555681 100644
--- a/lib/op-attrs/src/softmax.cc
+++ b/lib/op-attrs/src/softmax.cc
@@ -3,20 +3,20 @@
 namespace FlexFlow {
 
 bool SoftmaxAttrs::is_valid(ParallelTensorShape const &input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    if(input.num_dims() < 2) {
-        return false;
-    }
-    return true;
+  if (!input.is_valid()) {
+    return false;
+  }
+  if (input.num_dims() < 2) {
+    return false;
+  }
+  return true;
 }
 
-ParallelTensorShape get_output_shape(SoftmaxAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
-    assert(attrs.is_valid(input));
-    ParallelTensorShape output = input;
-    return output;
+ParallelTensorShape get_output_shape(SoftmaxAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  assert(attrs.is_valid(input));
+  ParallelTensorShape output = input;
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/split.cc b/lib/op-attrs/src/split.cc
index 6ba0b711d1..1c14f1c370 100644
--- a/lib/op-attrs/src/split.cc
+++ b/lib/op-attrs/src/split.cc
@@ -3,33 +3,33 @@
 
 namespace FlexFlow {
 
-bool SplitAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
-    std::size_t dims_sum = 0;
-
-    for(std::size_t i = 0; i < this->splits.size(); ++i) {
-        dims_sum += splits[i];
-    }
-
-    if(dims_sum != input.at(ff_dim_t(axis)).size) {
-        return false;
-    }
-    return true;
+bool SplitAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  std::size_t dims_sum = 0;
+
+  for (std::size_t i = 0; i < this->splits.size(); ++i) {
+    dims_sum += splits[i];
+  }
+
+  if (dims_sum != input.at(ff_dim_t(axis)).size) {
+    return false;
+  }
+  return true;
 }
 
-
-std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const & attrs,
-                                                   ParallelTensorShape const & input) {
-
-    assert(attrs.is_valid(input));
-    std::vector<ParallelTensorShape> outputs;
-    for(std::size_t i = 0 ; i < attrs.splits.size(); ++i) {
-        outputs.emplace_back(input);
-        outputs.back().at(ff_dim_t(attrs.axis)).size = attrs.splits[i];
-    }
-    return outputs;
+std::vector<ParallelTensorShape>
+    get_output_shapes(SplitAttrs const &attrs,
+                      ParallelTensorShape const &input) {
+
+  assert(attrs.is_valid(input));
+  std::vector<ParallelTensorShape> outputs;
+  for (std::size_t i = 0; i < attrs.splits.size(); ++i) {
+    outputs.emplace_back(input);
+    outputs.back().at(ff_dim_t(attrs.axis)).size = attrs.splits[i];
+  }
+  return outputs;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/topk.cc b/lib/op-attrs/src/topk.cc
index 00c2d97902..06c43b3eba 100644
--- a/lib/op-attrs/src/topk.cc
+++ b/lib/op-attrs/src/topk.cc
@@ -2,24 +2,23 @@
 
 namespace FlexFlow {
 
-bool TopKAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(!input.is_valid()) {
-        return false;
-    }
+bool TopKAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
 
-    if(k > input.at(ff_dim_t(axis)).size) {
-        return false;
-    }
-    return true;
+  if (k > input.at(ff_dim_t(axis)).size) {
+    return false;
+  }
+  return true;
 }
 
-
-ParallelTensorShape get_output_shape(TopKAttrs const & attrs,
-                                     ParallelTensorShape const & input) {
-    assert(attrs.is_valid(input));
-    ParallelTensorShape output = input;
-    output.at(ff_dim_t(attrs.axis)).size = attrs.k;
-    return output;
+ParallelTensorShape get_output_shape(TopKAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  assert(attrs.is_valid(input));
+  ParallelTensorShape output = input;
+  output.at(ff_dim_t(attrs.axis)).size = attrs.k;
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/transpose.cc b/lib/op-attrs/src/transpose.cc
index ad4a84a3d5..97140c6b49 100644
--- a/lib/op-attrs/src/transpose.cc
+++ b/lib/op-attrs/src/transpose.cc
@@ -1,3 +1,41 @@
 #include "op-attrs/ops/transpose.h"
+#include "op-attrs/ff_dim.h"
+#include "utils/exception.decl.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+bool TransposeAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (!input.is_valid()) {
+    return false;
+  }
+  // in pytorch, we choose two dims for transpose, so I think the size of
+  // perm should be 2
+  if (perm.size() != 2) {
+    return false;
+  }
+
+  auto dim0 = perm[0];
+  auto dim1 = perm[1];
+  if (dim0 < 0 || dim1 < 0 || dim0 >= input.num_dims() ||
+      dim1 >= input.num_dims()) {
+    return false;
+  }
+
+  return true;
+}
+
+// assume we have [x, y, z, l] and perm is [0,2]; we return [z, y, x, l]
+ParallelTensorShape get_output_shape(TransposeAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  auto dim0 = attrs.perm[0];
+  auto dim1 = attrs.perm[1];
+  int temp = input.at(ff_dim_t(dim0)).size;
+  output.at(ff_dim_t(dim0)).size = input.at(ff_dim_t(dim1)).size;
+  output.at(ff_dim_t(dim1)).size = temp;
+  return output;
+}
+
+}
+
+} // namespace FlexFlow

From 590dac5b72273cc8cc5b66badec10941a606d7de Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 15:29:11 +0000
Subject: [PATCH 30/69] update the batch matmul

---
 .../src/cuda/aggregate_spec_kernels.cu        | 152 +++++++++---------
 lib/op-attrs/src/batch_matmul.cc              |  16 +-
 2 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/lib/kernels/src/cuda/aggregate_spec_kernels.cu b/lib/kernels/src/cuda/aggregate_spec_kernels.cu
index 8a39b7f558..d46dc64567 100644
--- a/lib/kernels/src/cuda/aggregate_spec_kernels.cu
+++ b/lib/kernels/src/cuda/aggregate_spec_kernels.cu
@@ -30,82 +30,6 @@ AggregateSpecPerDeviceState::~AggregateSpecPerDeviceState(void) {
 namespace Kernels {
 namespace AggregateSpec {
 
-void forward_kernel(cudaStream_t stream,
-                    AggregateSpecPerDeviceState const *m,
-                    float **exp_preds,
-                    int const *acc_gate_assign_ptr,
-                    float *acc_output_ptr,
-                    int n,
-                    int const k,
-                    int rows,
-                    int const batch_size,
-                    int out_dim) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
-  // call forward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_preds,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-
-  aggspec_forward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                           min(CUDA_NUM_THREADS,
-                               (int)(batch_size * k * out_dim)),
-                           0,
-                           stream>>>(m->dev_region_ptrs,
-                                     acc_gate_assign_ptr,
-                                     acc_output_ptr,
-                                     n,
-                                     k,
-                                     rows,
-                                     batch_size,
-                                     out_dim);
-}
-
-void backward_kernel(cudaStream_t stream,
-                     AggregateSpecPerDeviceState const *m,
-                     float **exp_grads,
-                     int const *acc_gate_assign_ptr,
-                     int const *acc_true_gate_assign_ptr,
-                     float const *acc_gate_pred_ptr,
-                     float *acc_full_gate_grad_ptr,
-                     float const *acc_output_grad_ptr,
-                     int n,
-                     int const k,
-                     int rows,
-                     float lambda_bal,
-                     int const batch_size,
-                     int out_dim) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
-  // call backward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_grads,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-
-  aggspec_backward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                            min(CUDA_NUM_THREADS,
-                                (int)(batch_size * k * out_dim)),
-                            0,
-                            stream>>>(m->dev_region_ptrs,
-                                      acc_gate_assign_ptr,
-                                      acc_true_gate_assign_ptr,
-                                      acc_gate_pred_ptr,
-                                      acc_full_gate_grad_ptr,
-                                      acc_output_grad_ptr,
-                                      n,
-                                      k,
-                                      rows,
-                                      lambda_bal,
-                                      batch_size,
-                                      out_dim);
-}
-
 __global__ void
     aggspec_forward_kernel(float **exp_preds,
                            int const *exp_assign,
@@ -297,6 +221,82 @@ __global__ void
                                out_dim);
 }
 
+void forward_kernel(cudaStream_t stream,
+                    AggregateSpecPerDeviceState const *m,
+                    float **exp_preds,
+                    int const *acc_gate_assign_ptr,
+                    float *acc_output_ptr,
+                    int n,
+                    int const k,
+                    int rows,
+                    int const batch_size,
+                    int out_dim) {
+
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  // call forward kernel
+  cudaMemcpy(m->dev_region_ptrs,
+             exp_preds,
+             n * sizeof(float *),
+             cudaMemcpyHostToDevice);
+
+  aggspec_forward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
+                           min(CUDA_NUM_THREADS,
+                               (int)(batch_size * k * out_dim)),
+                           0,
+                           stream>>>(m->dev_region_ptrs,
+                                     acc_gate_assign_ptr,
+                                     acc_output_ptr,
+                                     n,
+                                     k,
+                                     rows,
+                                     batch_size,
+                                     out_dim);
+}
+
+void backward_kernel(cudaStream_t stream,
+                     AggregateSpecPerDeviceState const *m,
+                     float **exp_grads,
+                     int const *acc_gate_assign_ptr,
+                     int const *acc_true_gate_assign_ptr,
+                     float const *acc_gate_pred_ptr,
+                     float *acc_full_gate_grad_ptr,
+                     float const *acc_output_grad_ptr,
+                     int n,
+                     int const k,
+                     int rows,
+                     float lambda_bal,
+                     int const batch_size,
+                     int out_dim) {
+
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  // call backward kernel
+  cudaMemcpy(m->dev_region_ptrs,
+             exp_grads,
+             n * sizeof(float *),
+             cudaMemcpyHostToDevice);
+
+  aggspec_backward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
+                            min(CUDA_NUM_THREADS,
+                                (int)(batch_size * k * out_dim)),
+                            0,
+                            stream>>>(m->dev_region_ptrs,
+                                      acc_gate_assign_ptr,
+                                      acc_true_gate_assign_ptr,
+                                      acc_gate_pred_ptr,
+                                      acc_full_gate_grad_ptr,
+                                      acc_output_grad_ptr,
+                                      n,
+                                      k,
+                                      rows,
+                                      lambda_bal,
+                                      batch_size,
+                                      out_dim);
+}
+
 } // namespace AggregateSpec
 } // namespace Kernels
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index f30869e035..e8842c7722 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -10,14 +10,26 @@ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
   if (!lhs.is_valid() || !rhs.is_valid()) {
     return false;
   }
-  if (lhs.num_dims() != rhs.num_dims()) {
+  
+  if(lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
+    return false;
+  }
+  if(lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size) {
+    return false;
+  }
+  if(lhs.at(ff_dim_t(1)).size != a_seq_length_dim) {
+    return false;
+  }
+
+  if(rhs.at(ff_dim_t(2)).size != b_seq_length_dim) {
     return false;
   }
 
   return true;
 }
 
-// how to get the batch size? and lhs: [b, s1, k], rhs: [b, k, s1]
+// how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
+//output: [b, n, p] //n == s1, m == s2
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {

From 69c13bae6d85b71e9595152230718b361439db60 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 15:37:00 +0000
Subject: [PATCH 31/69] add valid check for conv_2d

---
 lib/op-attrs/include/op-attrs/ops/conv_2d.h | 2 +-
 lib/op-attrs/src/batch_norm.cc              | 4 +++-
 lib/op-attrs/src/combine.cc                 | 2 +-
 lib/op-attrs/src/concat.cc                  | 3 +++
 lib/op-attrs/src/conv_2d.cc                 | 6 +++++-
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
index 31290d153e..51da1ac91c 100644
--- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
@@ -14,7 +14,7 @@ struct Conv2DAttrs {
       padding_w, groups;
   req<optional<Activation>> activation;
   req<bool> use_bias;
-  bool is_valid(TensorShape const &input) const;
+  bool is_valid(ParallelTensorShape const &input) const;
 };
 
 FF_VISITABLE_STRUCT(Conv2DAttrs,
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 9b15913d1f..ed5da8286a 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -11,7 +11,9 @@ bool BatchNormAttrs::is_valid(ParallelTensorShape const &input) {
 
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  return input;
+  ParallelTensorShape output_shape = input;
+  
+  return output_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index 7814442926..5309e5a620 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -12,7 +12,7 @@ bool CombineAttrs::is_valid(ParallelTensorShape const &input) const {
 
 ParallelTensorShape get_output_shape(CombineAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  ParallelTensorShape output = input_shape;
+  ParallelTensorShape output = input;
   output.at(attrs.combine_dim).degree /= attrs.combine_degree;
   return output;
 }
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index e4b9496e69..34e19cb423 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -7,6 +7,9 @@ bool ConcatAttrs::is_valid(
   bool valid = true;
   for (auto p : input) {
     valid &= p.is_valid();
+    if(axis >= p.num_dims(())){
+      return false;
+    }
   }
   return valid;
 }
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 42085ca2b1..facba4d661 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -81,10 +81,14 @@ std::vector<ParallelDimMappingRecord>
   return mappings;
 }
 
-bool Conv2DAttrs::is_valid(TensorShape const &input) const {
+bool Conv2DAttrs::is_valid(ParallelTensorShape const &input) const {
   if (!input.is_valid()) {
     return false;
   }
+  if(input.num_dims() != 4) {
+    return false;
+  }
+  
   return true;
 }
 

From b56d9c0c49a4daaafb0be09f7329ea56a7082a2f Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 15:49:56 +0000
Subject: [PATCH 32/69] format the code and get_output_shape draft version0.1

---
 .../include/op-attrs/ops/element_binary.h     |  2 +-
 lib/op-attrs/src/batch_matmul.cc              | 12 ++++-----
 lib/op-attrs/src/batch_norm.cc                |  2 +-
 lib/op-attrs/src/concat.cc                    |  2 +-
 lib/op-attrs/src/conv_2d.cc                   |  4 +--
 lib/op-attrs/src/element_unary.cc             |  1 +
 lib/op-attrs/src/reverse.cc                   | 27 +++++++++----------
 7 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/element_binary.h b/lib/op-attrs/include/op-attrs/ops/element_binary.h
index c068fcc45c..7b731bf40f 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_binary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_binary.h
@@ -2,9 +2,9 @@
 #define _FLEXFLOW_ELEMENT_BINARY_ATTRS_H
 
 #include "core.h"
-#include "op-attrs/datatype.h"
 #include "op-attrs/op.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/©"
 #include "utils/visitable.h"
 
 namespace FlexFlow {
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index e8842c7722..b460f844f6 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -10,18 +10,18 @@ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
   if (!lhs.is_valid() || !rhs.is_valid()) {
     return false;
   }
-  
-  if(lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
+
+  if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
     return false;
   }
-  if(lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size) {
+  if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size) {
     return false;
   }
-  if(lhs.at(ff_dim_t(1)).size != a_seq_length_dim) {
+  if (lhs.at(ff_dim_t(1)).size != a_seq_length_dim) {
     return false;
   }
 
-  if(rhs.at(ff_dim_t(2)).size != b_seq_length_dim) {
+  if (rhs.at(ff_dim_t(2)).size != b_seq_length_dim) {
     return false;
   }
 
@@ -29,7 +29,7 @@ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
 }
 
 // how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
-//output: [b, n, p] //n == s1, m == s2
+// output: [b, n, p] //n == s1, m == s2
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index ed5da8286a..526871fc46 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -12,7 +12,7 @@ bool BatchNormAttrs::is_valid(ParallelTensorShape const &input) {
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output_shape = input;
-  
+
   return output_shape;
 }
 
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 34e19cb423..39c06d07cc 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -7,7 +7,7 @@ bool ConcatAttrs::is_valid(
   bool valid = true;
   for (auto p : input) {
     valid &= p.is_valid();
-    if(axis >= p.num_dims(())){
+    if (axis >= p.num_dims(())) {
       return false;
     }
   }
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index facba4d661..ed89b380df 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -85,10 +85,10 @@ bool Conv2DAttrs::is_valid(ParallelTensorShape const &input) const {
   if (!input.is_valid()) {
     return false;
   }
-  if(input.num_dims() != 4) {
+  if (input.num_dims() != 4) {
     return false;
   }
-  
+
   return true;
 }
 
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index 1fd11abe05..36e58ff263 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -13,5 +13,6 @@ ParallelTensorShape get_output_shape(ElementUnaryAttrs const &atts,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
   return output;
+}
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
index 0af3806778..a09d43ae61 100644
--- a/lib/op-attrs/src/reverse.cc
+++ b/lib/op-attrs/src/reverse.cc
@@ -3,21 +3,20 @@
 
 namespace FlexFlow {
 
-bool ReverseAttrs::is_valid(ParallelTensorShape const & input) const {
-    if(input.is_valid() ==false) {
-        return false;
-    }
-    if(this->axis < 0 || this->axis >= input.num_dims()) {
-        return false;
-    }
-    return true;
+bool ReverseAttrs::is_valid(ParallelTensorShape const &input) const {
+  if (input.is_valid() == false) {
+    return false;
+  }
+  if (this->axis < 0 || this->axis >= input.num_dims()) {
+    return false;
+  }
+  return true;
 }
 
-ParallelTensorShape get_output_shape(ReverseAttrs const & attrs, 
-                                     ParallelTensorShape const & input) {
-    ParallelTensorShape output = input;
-    return output;
+ParallelTensorShape get_output_shape(ReverseAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output = input;
+  return output;
 }
 
-
-};
\ No newline at end of file
+} // namespace FlexFlow

From 1f8d85d2f2f37e7274465cf9f9f18caa1bead45d Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 00:36:17 +0000
Subject: [PATCH 33/69] leave attention  to implement

---
 lib/op-attrs/include/op-attrs/ops/attention.h | 38 +++++++++----------
 lib/op-attrs/src/attention.cc                 | 38 ++++++++++++++-----
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
index 670e4018cc..69636d1fa7 100644
--- a/lib/op-attrs/include/op-attrs/ops/attention.h
+++ b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -7,6 +7,23 @@
 
 namespace FlexFlow {
 
+struct MultiHeadAttentionAttrs {
+  req<int> embed_dim, num_heads, kdim, vdim;
+  req<float> dropout;
+  req<bool> bias, add_bias_kv, add_zero_attn;
+};
+
+FF_VISITABLE_STRUCT(MultiHeadAttentionAttrs,
+                    embed_dim,
+                    num_heads,
+                    kdim,
+                    vdim,
+                    dropout,
+                    bias,
+                    add_bias_kv,
+                    add_zero_attn);
+
+
 template <typename TensorType>
 struct MultiHeadAttentionInputs
     : public use_visitable_cmp<MultiHeadAttentionInputs<TensorType>> {
@@ -28,23 +45,7 @@ struct MultiHeadAttentionInputs
   TensorType value;
 };
 
-struct MultiHeadAttentionAttrs {
-  req<int> embed_dim, num_heads, kdim, vdim;
-  req<float> dropout;
-  req<bool> bias, add_bias_kv, add_zero_attn;
-  bool is_valid(MultiHeadAttentionInputs<ParallelTensorShape> const &) const;
-};
-
-FF_VISITABLE_STRUCT(MultiHeadAttentionAttrs,
-                    embed_dim,
-                    num_heads,
-                    kdim,
-                    vdim,
-                    dropout,
-                    bias,
-                    add_bias_kv,
-                    add_zero_attn);
-CHECK_VALID_OP_ATTR(MultiHeadAttentionAttrs);
+bool is_valid(MultiHeadAttentionAttrs const &, MultiHeadAttentionInputs<ParallelTensorShape> const &input);
 
 int get_qProjSize(MultiHeadAttentionAttrs const &);
 int get_vProjSize(MultiHeadAttentionAttrs const &);
@@ -70,11 +71,10 @@ ParallelTensorShape
 ParallelTensorShape
     get_output_shape(MultiHeadAttentionAttrs const &,
                      MultiHeadAttentionInputs<ParallelTensorShape> const &);
-
 TensorShape get_output_shape(MultiHeadAttentionAttrs const &,
                              MultiHeadAttentionInputs<TensorShape> const &);
 
 CHECK_VALID_OP_ATTR(MultiHeadAttentionAttrs);
 } // namespace FlexFlow
 
-#endif
+#endif
\ No newline at end of file
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index a6ae56ddfd..b212cdea8e 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,5 +1,7 @@
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/exceptions.h"
+#include "kernels/legion_dim.h"
 
 namespace FlexFlow {
 
@@ -11,15 +13,6 @@ namespace FlexFlow {
 /*   return is_valid; */
 /* } */
 
-bool MultiHeadAttentionAttrs::is_valid(
-    MultiHeadAttentionInputs<ParallelTensorShape> const &input) const {
-  bool valid = true;
-  valid &= input.key.is_valid();
-  valid &= input.query.is_valid();
-  valid &= input.value.is_valid();
-  return valid;
-}
-
 int get_qProjSize(MultiHeadAttentionAttrs const &attrs) {
   return attrs.kdim;
 }
@@ -92,14 +85,39 @@ TensorShape
 //   return get_tensor_shape_unsafe(parallel_shape);
 // }
 
+//according to the pytorch  https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
+//query: [target_size_seq_len, batch_size, embed_dim], we consider the batch size 
+//key: (seq_len, batch_size, embed_dim)
+//value: (seq_len, batch_size, embed_dim)
+// multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+//output: (target_size_seq_len, batch_size, embed_dim)
+
 ParallelTensorShape get_output_shape(
     MultiHeadAttentionAttrs const &attrs,
     MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
   ParallelTensorShape output_shape = inputs.query;
-  output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size = attrs.embed_dim;
+
   return output_shape;
 }
 
+bool is_valid(MultiHeadAttentionAttrs const & attrs, MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
+  bool valid = true;
+  if(input.query.num_dims() != 3 || input.key.num_dims() != 3 || input.value.num_dims() != 3) {
+    return false;
+  }
+  //ff_dim_t = num_dims - legion_dim_t - 1 
+  if(input.query.at(legion_dim_t(0)).size != attrs.embed_dim) {
+    return false;
+  }
+  if(input.key.at(legion_dim_t(0)).size != attrs.embed_dim) {
+    return false;
+  }
+  if(input.value.at(legion_dim_t(0)).size != attrs.embed_dim) {
+    return false;
+  }
+  return true; 
+}
+
 } // namespace FlexFlow
 
 // Tensor FFModel::multihead_attention(const Tensor query,

From c9625645698f653344688a3c160d27d755f382cc Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 00:56:51 +0000
Subject: [PATCH 34/69] add batch_matmul

---
 .../include/op-attrs/ops/batch_matmul.h       |  4 +-
 lib/op-attrs/src/attention.cc                 |  4 +-
 lib/op-attrs/src/batch_matmul.cc              | 39 ++++++++++++++-----
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index f64b2fd8fb..1aa2fb7f59 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -9,12 +9,14 @@ namespace FlexFlow {
 
 struct BatchMatmulAttrs {
   req<int> a_seq_length_dim, b_seq_length_dim;
-  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &);
 };
 FF_VISITABLE_STRUCT(BatchMatmulAttrs, a_seq_length_dim, b_seq_length_dim);
 
 CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
 
+bool is_valid(BatchMatmulAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &);
+
+
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
                                      ParallelTensorShape const &,
                                      ParallelTensorShape const &);
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index b212cdea8e..2ed20b3016 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/exception.decl.h"
 #include "utils/exceptions.h"
 #include "kernels/legion_dim.h"
 
@@ -96,8 +97,7 @@ ParallelTensorShape get_output_shape(
     MultiHeadAttentionAttrs const &attrs,
     MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
   ParallelTensorShape output_shape = inputs.query;
-
-  return output_shape;
+  NOT_IMPLEMENTED();
 }
 
 bool is_valid(MultiHeadAttentionAttrs const & attrs, MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index b460f844f6..1f73da081d 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -5,23 +5,22 @@
 namespace FlexFlow {
 
 // maybe we should add more check here
-bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
+//// how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
+// output: [b, n, p] //n == s1, m == s2
+//[n/]
+bool is_valid(BatchMatmulAttrs const & attrs, ParallelTensorShape const &lhs,
                                 ParallelTensorShape const &rhs) {
-  if (!lhs.is_valid() || !rhs.is_valid()) {
-    return false;
-  }
-
   if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
     return false;
   }
   if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size) {
     return false;
   }
-  if (lhs.at(ff_dim_t(1)).size != a_seq_length_dim) {
+  if (lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim) {
     return false;
   }
 
-  if (rhs.at(ff_dim_t(2)).size != b_seq_length_dim) {
+  if (rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
     return false;
   }
 
@@ -30,14 +29,34 @@ bool BatchMatmulAttrs::is_valid(ParallelTensorShape const &lhs,
 
 // how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
 // output: [b, n, p] //n == s1, m == s2
+//[b, n/2, m], [b, m, p/2] -> [b, n/2, p/2]
+//[b, n, m/2], [b, m/2, p] -> [b, n, p/2]
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
   ParallelTensorShape output_shape = lhs;
   output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size;
-  output_shape.at(ff_dim_t(1)).size = attrs.a_seq_length_dim;
-  output_shape.at(ff_dim_t(2)).size = attrs.b_seq_length_dim;
-  // TODO: Do we need to set the ParallelDim for output_shape
+  //degree is 1
+  //[b, n, m], rhs: [b, m, p] -> [b, n, p]
+  if(lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree == 1) {
+    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
+    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
+    output_shape.at(ff_dim_t(0)).is_replica_dim= false;
+  } else if(lhs.at(ff_dim_t(1)).degree>1 && rhs.at(ff_dim_t(2)).degree == 1) { //[b, n/x, m], [b, m, p/x] => [b, n/x, p/x]
+    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size/lhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
+    output_shape.at(ff_dim_t(0)).is_replica_dim= true;
+  } else if(lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree > 1) { //[b, n, m/x], [b, m/x, p] => [b, n, p/x]
+    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
+    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
+     output_shape.at(ff_dim_t(0)).is_replica_dim= true;
+  } else if(lhs.at(ff_dim_t(1)).degree > 1 && rhs.at(ff_dim_t(2)).degree > 1) { //[b, n/x, m/y], [b, m/y, p/z] => [b, n/x, p/z]
+    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size/lhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
+     output_shape.at(ff_dim_t(0)).is_replica_dim= true;
+  } else {
+    assert(false && "not supported in BatchMatmulAttrs get_output_shape");
+  }
   return output_shape;
 }
 

From ab7efc8e3d0f71af54510471235db8b2c163c9b4 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 00:59:13 +0000
Subject: [PATCH 35/69] add batch_matmul

---
 lib/op-attrs/include/op-attrs/ops/attention.h |  6 +--
 .../include/op-attrs/ops/batch_matmul.h       |  5 ++-
 lib/op-attrs/src/attention.cc                 | 31 +++++++------
 lib/op-attrs/src/batch_matmul.cc              | 44 ++++++++++++-------
 4 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
index 69636d1fa7..7d332ddc0e 100644
--- a/lib/op-attrs/include/op-attrs/ops/attention.h
+++ b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -23,7 +23,6 @@ FF_VISITABLE_STRUCT(MultiHeadAttentionAttrs,
                     add_bias_kv,
                     add_zero_attn);
 
-
 template <typename TensorType>
 struct MultiHeadAttentionInputs
     : public use_visitable_cmp<MultiHeadAttentionInputs<TensorType>> {
@@ -45,7 +44,8 @@ struct MultiHeadAttentionInputs
   TensorType value;
 };
 
-bool is_valid(MultiHeadAttentionAttrs const &, MultiHeadAttentionInputs<ParallelTensorShape> const &input);
+bool is_valid(MultiHeadAttentionAttrs const &,
+              MultiHeadAttentionInputs<ParallelTensorShape> const &input);
 
 int get_qProjSize(MultiHeadAttentionAttrs const &);
 int get_vProjSize(MultiHeadAttentionAttrs const &);
@@ -77,4 +77,4 @@ TensorShape get_output_shape(MultiHeadAttentionAttrs const &,
 CHECK_VALID_OP_ATTR(MultiHeadAttentionAttrs);
 } // namespace FlexFlow
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index 1aa2fb7f59..8b545b46f3 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -14,8 +14,9 @@ FF_VISITABLE_STRUCT(BatchMatmulAttrs, a_seq_length_dim, b_seq_length_dim);
 
 CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
 
-bool is_valid(BatchMatmulAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &);
-
+bool is_valid(BatchMatmulAttrs const &,
+              ParallelTensorShape const &,
+              ParallelTensorShape const &);
 
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
                                      ParallelTensorShape const &,
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index 2ed20b3016..2d189d7472 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,8 +1,8 @@
 #include "op-attrs/ops/attention.h"
+#include "kernels/legion_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "utils/exception.decl.h"
 #include "utils/exceptions.h"
-#include "kernels/legion_dim.h"
 
 namespace FlexFlow {
 
@@ -86,12 +86,13 @@ TensorShape
 //   return get_tensor_shape_unsafe(parallel_shape);
 // }
 
-//according to the pytorch  https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
-//query: [target_size_seq_len, batch_size, embed_dim], we consider the batch size 
-//key: (seq_len, batch_size, embed_dim)
-//value: (seq_len, batch_size, embed_dim)
-// multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-//output: (target_size_seq_len, batch_size, embed_dim)
+// according to the pytorch
+// https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
+// query: [target_size_seq_len, batch_size, embed_dim] (batch size considered)
+// key: (seq_len, batch_size, embed_dim)
+// value: (seq_len, batch_size, embed_dim)
+//  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+// output: (target_size_seq_len, batch_size, embed_dim)
 
 ParallelTensorShape get_output_shape(
     MultiHeadAttentionAttrs const &attrs,
@@ -100,22 +101,24 @@ ParallelTensorShape get_output_shape(
   NOT_IMPLEMENTED();
 }
 
-bool is_valid(MultiHeadAttentionAttrs const & attrs, MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
+bool is_valid(MultiHeadAttentionAttrs const &attrs,
+              MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
   bool valid = true;
-  if(input.query.num_dims() != 3 || input.key.num_dims() != 3 || input.value.num_dims() != 3) {
+  if (input.query.num_dims() != 3 || input.key.num_dims() != 3 ||
+      input.value.num_dims() != 3) {
     return false;
   }
-  //ff_dim_t = num_dims - legion_dim_t - 1 
-  if(input.query.at(legion_dim_t(0)).size != attrs.embed_dim) {
+  // ff_dim_t = num_dims - legion_dim_t - 1
+  if (input.query.at(legion_dim_t(0)).size != attrs.embed_dim) {
     return false;
   }
-  if(input.key.at(legion_dim_t(0)).size != attrs.embed_dim) {
+  if (input.key.at(legion_dim_t(0)).size != attrs.embed_dim) {
     return false;
   }
-  if(input.value.at(legion_dim_t(0)).size != attrs.embed_dim) {
+  if (input.value.at(legion_dim_t(0)).size != attrs.embed_dim) {
     return false;
   }
-  return true; 
+  return true;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 1f73da081d..27aa81e003 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -8,8 +8,9 @@ namespace FlexFlow {
 //// how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
 // output: [b, n, p] //n == s1, m == s2
 //[n/]
-bool is_valid(BatchMatmulAttrs const & attrs, ParallelTensorShape const &lhs,
-                                ParallelTensorShape const &rhs) {
+bool is_valid(BatchMatmulAttrs const &attrs,
+              ParallelTensorShape const &lhs,
+              ParallelTensorShape const &rhs) {
   if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
     return false;
   }
@@ -36,24 +37,35 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &rhs) {
   ParallelTensorShape output_shape = lhs;
   output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size;
-  //degree is 1
+  // degree is 1
   //[b, n, m], rhs: [b, m, p] -> [b, n, p]
-  if(lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree == 1) {
+  if (lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree == 1) {
     output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
     output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
-    output_shape.at(ff_dim_t(0)).is_replica_dim= false;
-  } else if(lhs.at(ff_dim_t(1)).degree>1 && rhs.at(ff_dim_t(2)).degree == 1) { //[b, n/x, m], [b, m, p/x] => [b, n/x, p/x]
-    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size/lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
-    output_shape.at(ff_dim_t(0)).is_replica_dim= true;
-  } else if(lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree > 1) { //[b, n, m/x], [b, m/x, p] => [b, n, p/x]
+    output_shape.at(ff_dim_t(0)).is_replica_dim = false;
+  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
+             rhs.at(ff_dim_t(2)).degree ==
+                 1) { //[b, n/x, m], [b, m, p/x] => [b, n/x, p/x]
+    output_shape.at(ff_dim_t(1)).size =
+        lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).size =
+        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
+    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+  } else if (lhs.at(ff_dim_t(1)).degree == 1 &&
+             rhs.at(ff_dim_t(2)).degree >
+                 1) { //[b, n, m/x], [b, m/x, p] => [b, n, p/x]
     output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
-    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
-     output_shape.at(ff_dim_t(0)).is_replica_dim= true;
-  } else if(lhs.at(ff_dim_t(1)).degree > 1 && rhs.at(ff_dim_t(2)).degree > 1) { //[b, n/x, m/y], [b, m/y, p/z] => [b, n/x, p/z]
-    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size/lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size/rhs.at(ff_dim_t(2)).degree;
-     output_shape.at(ff_dim_t(0)).is_replica_dim= true;
+    output_shape.at(ff_dim_t(2)).size =
+        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
+    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
+             rhs.at(ff_dim_t(2)).degree >
+                 1) { //[b, n/x, m/y], [b, m/y, p/z] => [b, n/x, p/z]
+    output_shape.at(ff_dim_t(1)).size =
+        lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).size =
+        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
+    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
   } else {
     assert(false && "not supported in BatchMatmulAttrs get_output_shape");
   }

From 8c7395ddd38743ef2a6ccff8a2aaacd2a6dc510b Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 01:04:45 +0000
Subject: [PATCH 36/69] add batch norm

---
 lib/op-attrs/include/op-attrs/ops/batch_norm.h | 3 ++-
 lib/op-attrs/src/batch_norm.cc                 | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_norm.h b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
index c35d7bcd41..6f170c92f8 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
@@ -9,10 +9,11 @@ namespace FlexFlow {
 
 struct BatchNormAttrs {
   req<bool> relu;
-  bool is_valid(ParallelTensorShape const &);
 };
 FF_VISITABLE_STRUCT(BatchNormAttrs, relu);
 
+bool is_valid(BatchNormAttrs const &, ParallelTensorShape const &);
+
 ParallelTensorShape get_output_shape(BatchNormAttrs const &,
                                      ParallelTensorShape const &);
 
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 526871fc46..752bf5e06c 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -2,17 +2,18 @@
 
 namespace FlexFlow {
 
-bool BatchNormAttrs::is_valid(ParallelTensorShape const &input) {
-  if (!input.is_valid()) {
+bool is_valid(BatchNormAttrs const &attrs, ParallelTensorShape const &input) {
+  if (input.num_dims() != 4) {
     return false;
   }
   return true;
 }
 
+// input: [b, c, h, w]
+// output: [b, c, h, w]
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output_shape = input;
-
   return output_shape;
 }
 

From 1f1703c2dbbe91fc0f371a2858c2fe61cf436993 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 01:28:56 +0000
Subject: [PATCH 37/69] refine the batch_matmul

---
 lib/op-attrs/include/op-attrs/ops/cast.h    |  1 -
 lib/op-attrs/include/op-attrs/ops/combine.h |  1 -
 lib/op-attrs/src/batch_matmul.cc            | 66 +++++++++++++--------
 lib/op-attrs/src/batch_norm.cc              |  7 ---
 lib/op-attrs/src/cast.cc                    |  7 ---
 lib/op-attrs/src/combine.cc                 |  7 ---
 lib/op-attrs/src/concat.cc                  | 19 +++---
 7 files changed, 48 insertions(+), 60 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/cast.h b/lib/op-attrs/include/op-attrs/ops/cast.h
index 39d6fe1cc1..403fcc21a6 100644
--- a/lib/op-attrs/include/op-attrs/ops/cast.h
+++ b/lib/op-attrs/include/op-attrs/ops/cast.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 
 struct CastAttrs {
   req<DataType> dtype;
-  bool is_valid(ParallelTensorShape const &input) const;
 };
 FF_VISITABLE_STRUCT(CastAttrs, dtype);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/combine.h b/lib/op-attrs/include/op-attrs/ops/combine.h
index ffc04d4656..49bea57a38 100644
--- a/lib/op-attrs/include/op-attrs/ops/combine.h
+++ b/lib/op-attrs/include/op-attrs/ops/combine.h
@@ -11,7 +11,6 @@ namespace FlexFlow {
 struct CombineAttrs {
   ff_dim_t combine_dim;
   req<int> combine_degree;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(CombineAttrs, combine_dim, combine_degree);
 CHECK_VALID_OP_ATTR(CombineAttrs);
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 27aa81e003..209a064803 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -1,33 +1,11 @@
 #include "op-attrs/ops/batch_matmul.h"
 #include "op-attrs/ff_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/exception.decl.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-// maybe we should add more check here
-//// how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
-// output: [b, n, p] //n == s1, m == s2
-//[n/]
-bool is_valid(BatchMatmulAttrs const &attrs,
-              ParallelTensorShape const &lhs,
-              ParallelTensorShape const &rhs) {
-  if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
-    return false;
-  }
-  if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size) {
-    return false;
-  }
-  if (lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim) {
-    return false;
-  }
-
-  if (rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
-    return false;
-  }
-
-  return true;
-}
-
 // how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
 // output: [b, n, p] //n == s1, m == s2
 //[b, n/2, m], [b, m, p/2] -> [b, n/2, p/2]
@@ -36,16 +14,42 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
   ParallelTensorShape output_shape = lhs;
+
+  // Reject malformed operand shapes before any dimension is inspected.
+  if (!lhs.is_valid() || !rhs.is_valid()) {
+    throw mk_runtime_error(
+        "BatchMatmulAttrs::get_output_shape: input is invalid");
+  }
+
+  if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
+    throw mk_runtime_error(
+        "BatchMatmulAttrs::get_output_shape: batch size is not equal")
+  }
+
   output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size;
   // degree is 1
   //[b, n, m], rhs: [b, m, p] -> [b, n, p]
   if (lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree == 1) {
+    // check if the input is valid
+    if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size ||
+        lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim ||
+        rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
+      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs and rhs "
+                             "are not match when degree is 1");
+    }
+
     output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
     output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
     output_shape.at(ff_dim_t(0)).is_replica_dim = false;
   } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
              rhs.at(ff_dim_t(2)).degree ==
                  1) { //[b, n/x, m], [b, m, p/x] => [b, n/x, p/x]
+
+    if (lhs.at(ff_dim_t(1)).degree != rhs.at(ff_dim_t(2)).degree) {
+      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs.degree "
+                             ">1 and rhs.degree == 1, but degree is not equal");
+    }
+
     output_shape.at(ff_dim_t(1)).size =
         lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
     output_shape.at(ff_dim_t(2)).size =
@@ -54,6 +58,11 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
   } else if (lhs.at(ff_dim_t(1)).degree == 1 &&
              rhs.at(ff_dim_t(2)).degree >
                  1) { //[b, n, m/x], [b, m/x, p] => [b, n, p/x]
+    if (lhs.at(ff_dim_t(2)).degree != rhs.at(ff_dim_t(1)).degree) {
+      throw mk_runtime_error(
+          "BatchMatmulAttrs::get_output_shape: lhs.degree == 1 and rhs.degree "
+          "> 1, but degree is not equal");
+    }
     output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
     output_shape.at(ff_dim_t(2)).size =
         rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
@@ -61,13 +70,20 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
   } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
              rhs.at(ff_dim_t(2)).degree >
                  1) { //[b, n/x, m/y], [b, m/y, p/z] => [b, n/x, p/z]
+
+    if (lhs.at(ff_dim_t(1)).degree != rhs.at(ff_dim_t(2)).degree) {
+      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs.degree > "
+                             "1 and rhs.degree > 1, but degree is not equal");
+    }
+
     output_shape.at(ff_dim_t(1)).size =
         lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
     output_shape.at(ff_dim_t(2)).size =
         rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
     output_shape.at(ff_dim_t(0)).is_replica_dim = true;
   } else {
-    assert(false && "not supported in BatchMatmulAttrs get_output_shape");
+    throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: not supported "
+                           "in BatchMatmulAttrs get_output_shape");
   }
   return output_shape;
 }
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 752bf5e06c..e787cf741f 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -2,13 +2,6 @@
 
 namespace FlexFlow {
 
-bool is_valid(BatchNormAttrs const &attrs, ParallelTensorShape const &input) {
-  if (input.num_dims() != 4) {
-    return false;
-  }
-  return true;
-}
-
 // input: [b, c, h, w]
 // output: [b, c, h, w]
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index d3a6961a2e..a743d77f59 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -2,13 +2,6 @@
 
 namespace FlexFlow {
 
-bool CastAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(CastAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index 5309e5a620..48fc6c8720 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -3,13 +3,6 @@
 
 namespace FlexFlow {
 
-bool CombineAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(CombineAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 39c06d07cc..2f65ac5623 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -1,23 +1,18 @@
 #include "op-attrs/ops/concat.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool ConcatAttrs::is_valid(
-    std::vector<ParallelTensorShape> const &input) const {
-  bool valid = true;
-  for (auto p : input) {
-    valid &= p.is_valid();
-    if (axis >= p.num_dims(())) {
-      return false;
-    }
-  }
-  return valid;
-}
-
 ParallelTensorShape
     get_output_shape(ConcatAttrs const &attrs,
                      std::vector<ParallelTensorShape> const &inputs) {
   ParallelTensorShape output = inputs[0];
+  for (auto &i : inputs) {
+    if (attrs.axis >= i.num_dims() || i.is_valid() == false) {
+      throw mk_runtime_error("ConcatAttrs::get_output_shape: axis is out of "
+                             "range or input is invalid");
+    }
+  }
   for (auto &i : inputs) {
     output.at(attrs.axis).size += i.at(attrs.axis).size;
   }

From 9406a0b310dd426ca3a0b874366dcae4c6a5799f Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 12:54:52 +0000
Subject: [PATCH 38/69] refine the batch_matmul

---
 .../include/op-attrs/ops/batch_matmul.h       |  4 -
 .../include/op-attrs/ops/batch_norm.h         |  2 -
 lib/op-attrs/include/op-attrs/ops/conv_2d.h   |  1 -
 lib/op-attrs/src/batch_matmul.cc              | 78 +++++++------------
 lib/op-attrs/src/batch_norm.cc                |  5 ++
 lib/op-attrs/src/cast.cc                      |  4 +
 lib/op-attrs/src/conv_2d.cc                   |  5 ++
 7 files changed, 40 insertions(+), 59 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
index 8b545b46f3..c9d81c98e4 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h
@@ -14,10 +14,6 @@ FF_VISITABLE_STRUCT(BatchMatmulAttrs, a_seq_length_dim, b_seq_length_dim);
 
 CHECK_VALID_OP_ATTR(BatchMatmulAttrs);
 
-bool is_valid(BatchMatmulAttrs const &,
-              ParallelTensorShape const &,
-              ParallelTensorShape const &);
-
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &,
                                      ParallelTensorShape const &,
                                      ParallelTensorShape const &);
diff --git a/lib/op-attrs/include/op-attrs/ops/batch_norm.h b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
index 6f170c92f8..29b76d96e9 100644
--- a/lib/op-attrs/include/op-attrs/ops/batch_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/batch_norm.h
@@ -12,8 +12,6 @@ struct BatchNormAttrs {
 };
 FF_VISITABLE_STRUCT(BatchNormAttrs, relu);
 
-bool is_valid(BatchNormAttrs const &, ParallelTensorShape const &);
-
 ParallelTensorShape get_output_shape(BatchNormAttrs const &,
                                      ParallelTensorShape const &);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
index 51da1ac91c..79233eb8fc 100644
--- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h
@@ -14,7 +14,6 @@ struct Conv2DAttrs {
       padding_w, groups;
   req<optional<Activation>> activation;
   req<bool> use_bias;
-  bool is_valid(ParallelTensorShape const &input) const;
 };
 
 FF_VISITABLE_STRUCT(Conv2DAttrs,
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 209a064803..c4ff074302 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -1,7 +1,6 @@
 #include "op-attrs/ops/batch_matmul.h"
 #include "op-attrs/ff_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.decl.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -23,64 +22,39 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
 
   if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
     throw mk_runtime_error(
-        "BatchMatmulAttrs::get_output_shape: batch size is not equal")
+        "BatchMatmulAttrs::get_output_shape: batch size is not equal");
   }
+  if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size ||
+      lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim ||
+      rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
+    throw mk_runtime_error(
+        "BatchMatmulAttrs::get_output_shape: third demension of lhs and second "
+        "dementions of rhs are not match");
+  }
+  output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size; // batch size
+  output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
+  output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
 
-  output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size;
-  // degree is 1
-  //[b, n, m], rhs: [b, m, p] -> [b, n, p]
-  if (lhs.at(ff_dim_t(1)).degree == 1 && rhs.at(ff_dim_t(2)).degree == 1) {
-    // check if the input is valid
-    if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size ||
-        lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim ||
-        rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
-      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs and rhs "
-                             "are not match when degree is 1");
-    }
-
-    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
-    output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
+  if (lhs.at(ff_dim_t(1)).degree == 1 && lhs.at(ff_dim_t(2)).degree == 1) {
+    // case 0: degree is 1, [b, n, m], rhs: [b, m, p] -> [b, n, p]
     output_shape.at(ff_dim_t(0)).is_replica_dim = false;
-  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
-             rhs.at(ff_dim_t(2)).degree ==
-                 1) { //[b, n/x, m], [b, m, p/x] => [b, n/x, p/x]
-
-    if (lhs.at(ff_dim_t(1)).degree != rhs.at(ff_dim_t(2)).degree) {
-      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs.degree "
-                             ">1 and rhs.degree == 1, but degree is not equal");
-    }
-
-    output_shape.at(ff_dim_t(1)).size =
-        lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).size =
-        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
-    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
   } else if (lhs.at(ff_dim_t(1)).degree == 1 &&
-             rhs.at(ff_dim_t(2)).degree >
-                 1) { //[b, n, m/x], [b, m/x, p] => [b, n, p/x]
-    if (lhs.at(ff_dim_t(2)).degree != rhs.at(ff_dim_t(1)).degree) {
-      throw mk_runtime_error(
-          "BatchMatmulAttrs::get_output_shape: lhs.degree == 1 and rhs.degree "
-          "> 1, but degree is not equal");
-    }
-    output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
-    output_shape.at(ff_dim_t(2)).size =
-        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
+             lhs.at(ff_dim_t(2)).degree >
+                 1) { // case 1: [b, n, m/x], [b, m/x, p] => [b, n, y]
     output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
   } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
-             rhs.at(ff_dim_t(2)).degree >
-                 1) { //[b, n/x, m/y], [b, m/y, p/z] => [b, n/x, p/z]
-
-    if (lhs.at(ff_dim_t(1)).degree != rhs.at(ff_dim_t(2)).degree) {
-      throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: lhs.degree > "
-                             "1 and rhs.degree > 1, but degree is not equal");
-    }
-
-    output_shape.at(ff_dim_t(1)).size =
-        lhs.at(ff_dim_t(1)).size / lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).size =
-        rhs.at(ff_dim_t(2)).size / rhs.at(ff_dim_t(2)).degree;
+             lhs.at(ff_dim_t(2)).degree ==
+                 1) { // case 2: [b, n/x, m] [b m p/x] => [b n/x p/x]
+    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).degree = rhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
+  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
+             lhs.at(ff_dim_t(2)).degree >
+                 1) { // case 3: [b n/x m/y] [b m/y p/x]=> [b n/x p/x]
     output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
   } else {
     throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: not supported "
                            "in BatchMatmulAttrs get_output_shape");
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index e787cf741f..be11ac0e13 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/batch_norm.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -6,6 +7,10 @@ namespace FlexFlow {
 // output: [b, c, h, w]
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+  if (!input.is_valid() || input.num_dims() != 4) {
+    throw mk_runtime_error(
+        "BatchNormAttrs::get_output_shape: input is invalid");
+  }
   ParallelTensorShape output_shape = input;
   return output_shape;
 }
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index a743d77f59..7c679439ad 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -1,9 +1,13 @@
 #include "op-attrs/ops/cast.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(CastAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+  if (!input.is_valid()) {
+    throw mk_runtime_error("CastAttrs::get_output_shape: input is invalid");
+  }
   ParallelTensorShape output = input;
   output.data_type = attrs.dtype;
   return output;
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index ed89b380df..ab541cbe94 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -1,6 +1,7 @@
 #include "op-attrs/ops/conv_2d.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
+#include "utils/exception.h"
 #include "utils/vector.h"
 
 namespace FlexFlow {
@@ -101,6 +102,10 @@ bool Conv2DAttrs::is_valid(ParallelTensorShape const &input) const {
 ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
+  if (input.num_dims() != 4) {
+    throw mk_runtime_error("Conv2DAttrs::get_output_shape: input is invalid");
+  }
+
   output.at(ff_dim_t(1)).size = attrs.out_channels;
   output.at(ff_dim_t(2)).size =
       (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /

From 9a84d50055b04f17a24a15011a2c93a99f8676e2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 13:02:51 +0000
Subject: [PATCH 39/69] refine the conv2d

---
 lib/op-attrs/src/conv_2d.cc | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index ab541cbe94..867b4d1540 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/conv_2d.h"
+#include "op-attrs/ff_dim.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
 #include "utils/exception.h"
@@ -106,6 +107,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
     throw mk_runtime_error("Conv2DAttrs::get_output_shape: input is invalid");
   }
 
+  if (attrs.kernel_h > input.at(ff_dim_t(2)).size ||
+      attrs.kernel_w > input.at(ff_dim_t(3)).size) {
+    throw mk_runtime_error(
+        "Conv2DAttrs::get_output_shape: kernel size is larger than input size");
+  }
+
   output.at(ff_dim_t(1)).size = attrs.out_channels;
   output.at(ff_dim_t(2)).size =
       (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
@@ -115,6 +122,28 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
       (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
           attrs.stride_w +
       1;
+  if (input.at(ff_dim_t(2)).size == 1 && input.at(ff_dim_t(3)).size == 1) {
+    // case 1  input degree is 1, like 1GPU
+    output.at(ff_dim_t(0)).is_replica_dim = false;
+  } else if (input.at(ff_dim_t(2)).size > 1 &&
+             input.at(ff_dim_t(3)).size == 1) {
+    // case 2: [b, input_channel, input_h/x, input_w], [output_channel,
+    // input_channel, kernel_h, kernel_w] => [b, output_channel, output_h/x,
+    // output_w]
+    output.at(ff_dim_t(0)).is_replica_dim = true;
+    output.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
+    output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
+  } else if (input.at(ff_dim_t(2)).size == 1 &&
+             input.at(ff_dim_t(3)).size > 1) {
+    // case 3: [b, input_channel, input_h, input_w / x] [output_channel,
+    // input_channel, kernel_h, kernel_w / x] => [b, output_channel, output_h,
+    // output_w / x]
+    output.at(ff_dim_t(0)).is_replica_dim = true;
+    output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
+  } else {
+    throw mk_runtime_error("Conv2DAttrs::get_output_shape: not supported in "
+                           "Conv2DAttrs get_output_shape");
+  }
   return output;
 }
 

From 3208f5b1fb16526dd6132788381ca389a3f1baf0 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 13:04:59 +0000
Subject: [PATCH 40/69] delete the is_valid methods

---
 lib/op-attrs/include/op-attrs/ops/dropout.h        |  1 -
 lib/op-attrs/include/op-attrs/ops/element_binary.h |  2 --
 lib/op-attrs/src/conv_2d.cc                        | 11 -----------
 lib/op-attrs/src/dropout.cc                        |  7 -------
 lib/op-attrs/src/element_binary.cc                 |  7 -------
 5 files changed, 28 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/dropout.h b/lib/op-attrs/include/op-attrs/ops/dropout.h
index 04f244f27f..edf6db9ea8 100644
--- a/lib/op-attrs/include/op-attrs/ops/dropout.h
+++ b/lib/op-attrs/include/op-attrs/ops/dropout.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 struct DropoutAttrs {
   req<float> rate;
   req<unsigned long long> seed;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(DropoutAttrs, rate, seed);
 CHECK_VALID_OP_ATTR(DropoutAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/element_binary.h b/lib/op-attrs/include/op-attrs/ops/element_binary.h
index 7b731bf40f..9a2e4dc22a 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_binary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_binary.h
@@ -14,8 +14,6 @@ struct ElementBinaryAttrs {
   req<DataType> compute_type;
   req<bool> should_broadcast_lhs;
   req<bool> should_broadcast_rhs;
-  bool is_valid(ParallelTensorShape const &lhs,
-                ParallelTensorShape const &rhs) const;
 };
 FF_VISITABLE_STRUCT(ElementBinaryAttrs,
                     type,
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 867b4d1540..08e8315952 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -83,17 +83,6 @@ std::vector<ParallelDimMappingRecord>
   return mappings;
 }
 
-bool Conv2DAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  if (input.num_dims() != 4) {
-    return false;
-  }
-
-  return true;
-}
-
 // according to pytorch, the input shape: [b, input_channel, input_h, input_w]
 // kernel shape: [output_channel, input_channel, kernel_h, kernel_w]
 // we may have stide_h and padding_h
diff --git a/lib/op-attrs/src/dropout.cc b/lib/op-attrs/src/dropout.cc
index bccfdb10a2..ab763b8a7f 100644
--- a/lib/op-attrs/src/dropout.cc
+++ b/lib/op-attrs/src/dropout.cc
@@ -3,13 +3,6 @@
 
 namespace FlexFlow {
 
-bool DropoutAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(DropoutAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index 4b20ee25a9..f591d88719 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -2,13 +2,6 @@
 
 namespace FlexFlow {
 
-bool ElementBinaryAttrs::is_valid(ParallelTensorShape const &input1,
-                                  ParallelTensorShape const &input2) const {
-  if (!input1.is_valid() || !input2.is_valid()) {
-    return false;
-  }
-  return true;
-}
 
 ParallelTensorShape get_output_shape(ElementBinaryAttrs const &atts,
                                      ParallelTensorShape const &lhs,

From 8a6b29ea07309167d2a3e13f58b1b8e3515bd0ee Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 14:24:24 +0000
Subject: [PATCH 41/69] add gather

---
 .../include/op-attrs/ops/element_unary.h      |  1 -
 lib/op-attrs/include/op-attrs/ops/embedding.h |  1 -
 lib/op-attrs/include/op-attrs/ops/gather.h    |  1 -
 lib/op-attrs/src/element_binary.cc            |  1 -
 lib/op-attrs/src/element_unary.cc             |  7 ------
 lib/op-attrs/src/embedding.cc                 |  9 ++------
 lib/op-attrs/src/gather.cc                    | 23 ++++++++++++++++---
 7 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h
index 562c50e4ed..d0dbc3661c 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_unary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h
@@ -18,7 +18,6 @@ CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs);
 
 struct ElementUnaryAttrs {
   req<Op> op;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ElementUnaryAttrs, op);
 CHECK_VALID_OP_ATTR(ElementUnaryAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/embedding.h b/lib/op-attrs/include/op-attrs/ops/embedding.h
index 506b8a6186..52d22fe836 100644
--- a/lib/op-attrs/include/op-attrs/ops/embedding.h
+++ b/lib/op-attrs/include/op-attrs/ops/embedding.h
@@ -19,7 +19,6 @@ struct EmbeddingAttrs {
   req<int> num_entries, out_channels;
   req<AggregateOp> aggr;
   req<DataType> data_type;
-  bool is_valid(ParallelTensorShape const &input) const;
 };
 FF_VISITABLE_STRUCT(EmbeddingAttrs, num_entries, out_channels, aggr, data_type);
 CHECK_VALID_OP_ATTR(EmbeddingAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h
index 1789edf649..852dc9cd5e 100644
--- a/lib/op-attrs/include/op-attrs/ops/gather.h
+++ b/lib/op-attrs/include/op-attrs/ops/gather.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 
 struct GatherAttrs {
   ff_dim_t dim;
-  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(GatherAttrs, dim);
 CHECK_VALID_OP_ATTR(GatherAttrs);
diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index f591d88719..c61be195c0 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -2,7 +2,6 @@
 
 namespace FlexFlow {
 
-
 ParallelTensorShape get_output_shape(ElementBinaryAttrs const &atts,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index 36e58ff263..b9028ac3b8 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -2,13 +2,6 @@
 
 namespace FlexFlow {
 
-bool ElementUnaryAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(ElementUnaryAttrs const &atts,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index dca6e393ef..6598146092 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -2,13 +2,6 @@
 
 namespace FlexFlow {
 
-bool EmbeddingAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 // pytorch nn.Embedding
 // Embedding OP: (num_embeddings, embedding_dim) (num_entries, out_channels)
 // Input: (batch_size, seq_len)
@@ -20,3 +13,5 @@ ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
   output.at(ff_dim_t(2)).size = atts.out_channels;
   return output;
 } // namespace FlexFlow
+
+}
\ No newline at end of file
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index 25bfe8e516..5cdca14955 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/gather.h"
 #include "utils/exception.decl.h"
+#include "utils/exceptions.h"
 
 namespace FlexFlow {
 
@@ -17,12 +18,28 @@ bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
   return true;
 }
 
+//https://pytorch.org/docs/stable/generated/torch.gather.html
 // todo: why return a vector?
 std::vector<ParallelTensorShape>
     get_output_shapes(GatherAttrs const &attrs,
-                      ParallelTensorShape const &lhs,
-                      ParallelTensorShape const &rhs) {
-  NOT_IMPLEMENTED();
+                      ParallelTensorShape const & input,
+                      ParallelTensorShape const &index) {
+  if(input.num_dims() != index.num_dims()) {
+    throw mk_runtime_error("Gather: input and index must have the same number of dimensions");
+  }
+
+  for(int i = 0; i < input.num_dims(); i++) {
+    if(i != attrs.dim && input.at(ff_dim_t(i)).size <= index.at(ff_dim_t(i)).size) {
+      throw mk_runtime_error("Gather: index.size(d) <= input.size(d) for all dimensions d != dim");
+    }
+  }
+
+  ParallelTensorShape output = input;
+
+  std::vector<ParallelTensorShape> results;
+  //NOTE(lambda):why return a vector?
+  results.push_back(output);
+  return results;
 }
 
 /* bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,

From a8c75ece898844407e56113af36d25daf4a08594 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 14:47:50 +0000
Subject: [PATCH 42/69] add groupby

---
 lib/op-attrs/include/op-attrs/ops/groupby.h   |  1 -
 .../include/op-attrs/ops/layer_norm.h         |  1 -
 lib/op-attrs/src/embedding.cc                 |  3 +-
 lib/op-attrs/src/gather.cc                    | 21 +++++++-----
 lib/op-attrs/src/groupby.cc                   | 34 +++++++++++++------
 5 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/groupby.h b/lib/op-attrs/include/op-attrs/ops/groupby.h
index 702cbd2a1c..d2c1033b31 100644
--- a/lib/op-attrs/include/op-attrs/ops/groupby.h
+++ b/lib/op-attrs/include/op-attrs/ops/groupby.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 struct Group_byAttrs {
   req<int> n;
   req<float> alpha;
-  bool is_valid(ParallelTensorShape const &, ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(Group_byAttrs, n, alpha);
 CHECK_VALID_OP_ATTR(Group_byAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm.h b/lib/op-attrs/include/op-attrs/ops/layer_norm.h
index 15b6729262..f279b0650c 100644
--- a/lib/op-attrs/include/op-attrs/ops/layer_norm.h
+++ b/lib/op-attrs/include/op-attrs/ops/layer_norm.h
@@ -12,7 +12,6 @@ struct LayerNormAttrs {
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes;
   req<bool> elementwise_affine;
   req<float> eps;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(LayerNormAttrs, axes, elementwise_affine, eps);
 CHECK_VALID_OP_ATTR(LayerNormAttrs);
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index 6598146092..fa8f313457 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -11,7 +11,8 @@ ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
   ParallelTensorShape output = input;
   output.at(ff_dim_t(1)).size = input.at(ff_dim_t(1)).size;
   output.at(ff_dim_t(2)).size = atts.out_channels;
+  // output degree is same as input degree
   return output;
 } // namespace FlexFlow
 
-}
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index 5cdca14955..ed7e5abd7b 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -18,26 +18,29 @@ bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
   return true;
 }
 
-//https://pytorch.org/docs/stable/generated/torch.gather.html
-// todo: why return a vector?
+// https://pytorch.org/docs/stable/generated/torch.gather.html
+//  todo: why return a vector?
 std::vector<ParallelTensorShape>
     get_output_shapes(GatherAttrs const &attrs,
-                      ParallelTensorShape const & input,
+                      ParallelTensorShape const &input,
                       ParallelTensorShape const &index) {
-  if(input.num_dims() != index.num_dims()) {
-    throw mk_runtime_error("Gather: input and index must have the same number of dimensions");
+  if (input.num_dims() != index.num_dims()) {
+    throw mk_runtime_error(
+        "Gather: input and index must have the same number of dimensions");
   }
 
-  for(int i = 0; i < input.num_dims(); i++) {
-    if(i != attrs.dim && input.at(ff_dim_t(i)).size <= index.at(ff_dim_t(i)).size) {
-      throw mk_runtime_error("Gather: index.size(d) <= input.size(d) for all dimensions d != dim");
+  for (int i = 0; i < input.num_dims(); i++) {
+    if (i != attrs.dim &&
+        input.at(ff_dim_t(i)).size <= index.at(ff_dim_t(i)).size) {
+      throw mk_runtime_error(
+          "Gather: index.size(d) <= input.size(d) for all dimensions d != dim");
     }
   }
 
   ParallelTensorShape output = input;
 
   std::vector<ParallelTensorShape> results;
-  //NOTE(lambda):why return a vector?
+  // NOTE(lambda):why return a vector?
   results.push_back(output);
   return results;
 }
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index 9315b85c39..acae02d584 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -3,18 +3,32 @@
 
 namespace FlexFlow {
 
-bool Group_byAttrs::is_valid(ParallelTensorShape const &lhs,
-                             ParallelTensorShape const &rhs) const {
-  if (!lhs.is_valid() || !rhs.is_valid()) {
-    return false;
-  }
-  NOT_IMPLEMENTED();
-}
+/*
+import torch
+data = torch.tensor([10, 20, 30, 40, 50, 60, 70, 80])
+# group index tensor group_indices
+group_indices = torch.tensor([0, 1, 0, 2, 1, 2, 0, 1])
+
+# groupby operator
+unique_indices, unique_inverse_indices = torch.unique(group_indices,
+return_inverse=True) print(f"unique_indices: {unique_indices} and
+unique_inverse_indices: {unique_inverse_indices}") grouped_data = [] for i in
+unique_indices: # use unique_inverse_indices group_data =
+data[unique_inverse_indices == i] grouped_data.append(group_data) for i, group
+in enumerate(grouped_data): print(f"Group {i}: {group}")
+*/
 
 ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
-                                     ParallelTensorShape const &lhs,
-                                     ParallelTensorShape const &rhs) {
-  NOT_IMPLEMENTED();
+                                     ParallelTensorShape const &input,
+                                     ParallelTensorShape const &index) {
+  if (input.num_dims() != index.num_dims()) {
+    throw mk_runtime_error(
+        "Group_by: input and index must have the same number of dimensions");
+  }
+
+  ParallelTensorShape output = input;
+  // degree of output is same as input's
+  return output;
 }
 
 } // namespace FlexFlow

From bb615bd5c37ce04789766b29f01e6240c45721c2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 14:49:16 +0000
Subject: [PATCH 43/69] implement the layer_norm

---
 lib/op-attrs/src/groupby.cc    |  2 +-
 lib/op-attrs/src/layer_norm.cc | 15 +++++----------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index acae02d584..09babdb20d 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -1,5 +1,5 @@
 #include "op-attrs/ops/groupby.h"
-#include "utils/exception.decl.h"
+#include "utils/exceptions.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index 081252847a..58160b528f 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -1,21 +1,16 @@
 #include "op-attrs/ops/layer_norm.h"
+#include "utils/exceptions.h"
 
 namespace FlexFlow {
 
-bool LayerNormAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  if (input.num_dims() < 2) {
-    return false;
-  }
-  return true;
-}
-
 // todo: maybe we need to set the degree of parallel_dim
 ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+  if (input.num_dims() < 2) {
+    throw mk_runtime_error("LayerNorm: input must have at least 2 dimensions");
+  }
   ParallelTensorShape output = input;
+  // output degree is same as input degree
   return output;
 }
 

From bc823f4506d955c6b3183cd3695dacf7dc808982 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:01:32 +0000
Subject: [PATCH 44/69] add linear

---
 lib/op-attrs/include/op-attrs/ops/linear.h |  1 -
 lib/op-attrs/src/linear.cc                 | 43 ++++++++++++++++++----
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h
index 54a3864e8d..e696bb9fd0 100644
--- a/lib/op-attrs/include/op-attrs/ops/linear.h
+++ b/lib/op-attrs/include/op-attrs/ops/linear.h
@@ -29,7 +29,6 @@ struct LinearAttrs {
   req<DataType> data_type;
   req<Activation> activation;
   req<optional<RegularizerAttrs>> regularizer;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(
     LinearAttrs, out_channels, use_bias, data_type, activation, regularizer);
diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index bae30a8ebd..ef7832773a 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -1,23 +1,50 @@
 #include "op-attrs/ops/linear.h"
 #include "op-attrs/ff_dim.h"
+#include "utils/exception.decl.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool LinearAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
+//https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+//torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)
 // pytorch: input shape:{batch_size, input_channels}
 // pytorch linearattrs: should be {input_channels, output_channels}
 // pytorch: output shape:{batch_size, output_channels}
 // question: the Linearattrs doesn't have input_channels
 ParallelTensorShape get_output_shape(LinearAttrs const &atts,
                                      ParallelTensorShape const &input) {
+                          
   ParallelTensorShape out_shape = input;
-  out_shape.at(ff_dim_t(0)).size = atts.out_channels;
+  if(input.num_dims() != 2) {
+    throw mk_runtime_error("LinearAttrs: input shape should be 2D");
+  }
+
+  out_shape.at(ff_dim_t(1)).size = atts.out_channels;
+  //linear shoud consider the degree
+  //case 1: input:[N, K], weight:[K, M], degree is 1
+  if(input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree == 1  ) {
+    out_shape.at(ff_dim_t(0)).degree = 1;
+    for(int i = 0; i < input.num_dims(); i++) {
+      out_shape.at(ff_dim_t(i)).is_replica_dim = false;
+      out_shape.at(ff_dim_t(i)).degree = 1;
+    }
+  } else if(input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree > 1) {
+    //case 2: input [N, k/x], weight [k/x, M], output [N, M], degree is x
+    out_shape.at(ff_dim_t(1)).degree = input.at(ff_dim_t(1)).degree;
+    out_shape.at(ff_dim_t(1)).is_replica_dim = true;
+  } else if(input.at(ff_dim_t(0)).degree > 1 && input.at(ff_dim_t(1)).degree == 1)  {
+    //case 3: input [N/X, K], weight [K, M/X], output [N/X, M], degree is X  
+    out_shape.at(ff_dim_t(0)).degree = input.at(ff_dim_t(0)).degree;
+    out_shape.at(ff_dim_t(0)).is_replica_dim = true;
+  } else if(input.at(ff_dim_t(0)).degree > 1 && input.at(ff_dim_t(1)).degree > 1) {
+    //case 4: input [N/X, K/Y], weight [K/Y, M/X], output [N/X, M/X], degree is X
+    for(int i = 0; i < input.num_dims(); i++) {
+      out_shape.at(ff_dim_t(i)).is_replica_dim = true;
+      out_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
+    }
+  } else {
+    throw mk_runtime_error("LinearAttrs: degree is not supported");
+  }
   return out_shape;
 }
 

From f2a50e39b65a4445b21674d6fc06673b33e840f6 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:14:27 +0000
Subject: [PATCH 45/69] add pool2d

---
 lib/op-attrs/src/linear.cc  | 44 ++++++++++++++-------------
 lib/op-attrs/src/pool_2d.cc | 59 ++++++++++++++++++++++++++++++++-----
 2 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index ef7832773a..3bb8e0f3ae 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -1,44 +1,48 @@
 #include "op-attrs/ops/linear.h"
 #include "op-attrs/ff_dim.h"
-#include "utils/exception.decl.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
-//https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
-//torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)
-// pytorch: input shape:{batch_size, input_channels}
-// pytorch linearattrs: should be {input_channels, output_channels}
-// pytorch: output shape:{batch_size, output_channels}
-// question: the Linearattrs doesn't have input_channels
+// https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+// torch.nn.Linear(in_features, out_features, bias=True, device=None,
+// dtype=None)
+//  pytorch: input shape:{batch_size, input_channels}
+//  pytorch linearattrs: should be {input_channels, output_channels}
+//  pytorch: output shape:{batch_size, output_channels}
+//  question: the Linearattrs doesn't have input_channels
 ParallelTensorShape get_output_shape(LinearAttrs const &atts,
                                      ParallelTensorShape const &input) {
-                          
+
   ParallelTensorShape out_shape = input;
-  if(input.num_dims() != 2) {
+  if (input.num_dims() != 2) {
     throw mk_runtime_error("LinearAttrs: input shape should be 2D");
   }
 
   out_shape.at(ff_dim_t(1)).size = atts.out_channels;
-  //linear shoud consider the degree
-  //case 1: input:[N, K], weight:[K, M], degree is 1
-  if(input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree == 1  ) {
+  // linear should consider the degree
+  // case 1: input:[N, K], weight:[K, M], degree is 1
+  if (input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree == 1) {
     out_shape.at(ff_dim_t(0)).degree = 1;
-    for(int i = 0; i < input.num_dims(); i++) {
+    for (int i = 0; i < input.num_dims(); i++) {
       out_shape.at(ff_dim_t(i)).is_replica_dim = false;
       out_shape.at(ff_dim_t(i)).degree = 1;
     }
-  } else if(input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree > 1) {
-    //case 2: input [N, k/x], weight [k/x, M], output [N, M], degree is x
+  } else if (input.at(ff_dim_t(0)).degree == 1 &&
+             input.at(ff_dim_t(1)).degree > 1) {
+    // case 2: input [N, k/x], weight [k/x, M], output [N, M], degree is x
     out_shape.at(ff_dim_t(1)).degree = input.at(ff_dim_t(1)).degree;
     out_shape.at(ff_dim_t(1)).is_replica_dim = true;
-  } else if(input.at(ff_dim_t(0)).degree > 1 && input.at(ff_dim_t(1)).degree == 1)  {
-    //case 3: input [N/X, K], weight [K, M/X], output [N/X, M], degree is X  
+  } else if (input.at(ff_dim_t(0)).degree > 1 &&
+             input.at(ff_dim_t(1)).degree == 1) {
+    // case 3: input [N/X, K], weight [K, M/X], output [N/X, M], degree is X
     out_shape.at(ff_dim_t(0)).degree = input.at(ff_dim_t(0)).degree;
     out_shape.at(ff_dim_t(0)).is_replica_dim = true;
-  } else if(input.at(ff_dim_t(0)).degree > 1 && input.at(ff_dim_t(1)).degree > 1) {
-    //case 4: input [N/X, K/Y], weight [K/Y, M/X], output [N/X, M/X], degree is X
-    for(int i = 0; i < input.num_dims(); i++) {
+  } else if (input.at(ff_dim_t(0)).degree > 1 &&
+             input.at(ff_dim_t(1)).degree > 1) {
+    // case 4: input [N/X, K/Y], weight [K/Y, M/X]; each output dim copies its
+    // input degree, i.e. output [N/X, M/Y] -- NOTE(review): was documented as M/X
+    for (int i = 0; i < input.num_dims(); i++) {
       out_shape.at(ff_dim_t(i)).is_replica_dim = true;
       out_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
     }
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index 6d58210b6a..65754de6e3 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -2,6 +2,7 @@
 #include "op-attrs/ff_dim.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -47,15 +48,21 @@ bool Pool2DAttrs::is_valid(ParallelTensorShape const &input) const {
   return true;
 }
 
-// pytorch: we have two type of pool2d, maxpool2d and avgpool2d
-// input shape: (batch_size, channels, input_height, input_width)
-// for avgpool2d, output shape: (batch_size, channels, 1, 1)
-// for maxpool2d, output shape: (batch_size, channels, output_height,
-// output_width) output_height = (input_height + 2 * padding_h - kernel_h) /
-// stride_h + 1 output_width = (input_width + 2 * padding_w - kernel_w) /
-// stride_w + 1
+// https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
+// https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html
+//  pytorch: we have two type of pool2d, maxpool2d and avgpool2d
+//  input shape: (batch_size, channels, input_height, input_width)
+//  for avgpool2d, output shape: (batch_size, channels, 1, 1)
+//  for maxpool2d, output shape: (batch_size, channels, output_height,
+//  output_width) output_height = (input_height + 2 * padding_h - kernel_h) /
+//  stride_h + 1 output_width = (input_width + 2 * padding_w - kernel_w) /
+//  stride_w + 1
 ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+
+  if (input.num_dims() != 4) {
+    throw mk_runtime_error("Pool2DAttrs: input shape should be 4D");
+  }
   ParallelTensorShape output_shape = input;
   if (attrs.pool_type == PoolOp::AVG) {
     output_shape.at(ff_dim_t(2)).size = 1;
@@ -70,8 +77,44 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
             attrs.stride_w +
         1;
   } else {
-    assert(false && "unsupported pool type");
+    throw mk_runtime_error("Pool2DAttrs: pool type is not supported");
+  }
+
+  // case 1: input:[N, C, H, W], output:[N, C, 1, 1], degree is 1 for avgpool2d
+  // input: [N, C, H, W], output: [N, C, output_height, output_width], degree is  1 for maxpool2d
+  if (input.at(ff_dim_t(2)).degree == 1 && input.at(ff_dim_t(3)).degree == 1) {
+    for (int i = 2; i < input.num_dims(); i++) {
+      output_shape.at(ff_dim_t(i)).is_replica_dim = false;
+      output_shape.at(ff_dim_t(i)).degree = 1;
+    }
+  } else if (input.at(ff_dim_t(2)).degree > 1 &&
+             input.at(ff_dim_t(3)).degree == 1) {
+    // case 2: input [N, C, H/X, W] output [N, C, 1, 1], degree is X
+    // input [N, C, H/X, W] output [N, C, output_height/x, output_width], degree  is X
+    output_shape.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
+    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(3)).degree = 1;
+    output_shape.at(ff_dim_t(3)).is_replica_dim = false;
+  } else if (input.at(ff_dim_t(2)).degree == 1 &&
+             input.at(ff_dim_t(3)).degree > 1) {
+    // case 3: input [N, C, H, W/X] output [N, C, 1, 1], degree is X
+    // input [N, C, H, W/X] output [N, C, output_height, output_width/x], degree is X
+    output_shape.at(ff_dim_t(2)).degree = 1;
+    output_shape.at(ff_dim_t(2)).is_replica_dim = false;
+    output_shape.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
+    output_shape.at(ff_dim_t(3)).is_replica_dim = true;
+  } else if (input.at(ff_dim_t(2)).degree > 1 &&
+             input.at(ff_dim_t(3)).degree > 1) {
+    // case 4 (avgpool2d): input [N, C, H/X, W/Y], output [N, C, 1, 1], degree is X and Y
+    // case 4 (maxpool2d): input [N, C, H/X, W/Y], output [N, C, output_height/x, output_width/y], degree is X and Y
+    for (int i = 2; i < input.num_dims(); i++) {
+      output_shape.at(ff_dim_t(i)).is_replica_dim = true;
+      output_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
+    }
+  } else {
+    throw mk_runtime_error("Pool2DAttrs: degree is not supported");
   }
+
   return output_shape;
 }
 

From 2408b37046c975500c0407dd7ba2784f736f41cd Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:17:09 +0000
Subject: [PATCH 46/69] add repartition

---
 lib/op-attrs/include/op-attrs/ops/reduce.h    |  1 -
 lib/op-attrs/include/op-attrs/ops/reduction.h |  1 -
 .../include/op-attrs/ops/repartition.h        |  1 -
 lib/op-attrs/src/reduction.cc                 |  7 -----
 lib/op-attrs/src/repartition.cc               | 26 ++++++-------------
 5 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/reduce.h b/lib/op-attrs/include/op-attrs/ops/reduce.h
index c18d4cd888..96827a83cc 100644
--- a/lib/op-attrs/include/op-attrs/ops/reduce.h
+++ b/lib/op-attrs/include/op-attrs/ops/reduce.h
@@ -14,7 +14,6 @@ struct ReduceAttrs {
   stack_vector<ff_dim_t, MAX_TENSOR_DIM> axes;
   req<Op> op_type;
   req<bool> keepdims;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReduceAttrs, axes, op_type, keepdims);
 CHECK_VALID_OP_ATTR(ReduceAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/reduction.h b/lib/op-attrs/include/op-attrs/ops/reduction.h
index a8e7abd318..70f268c97d 100644
--- a/lib/op-attrs/include/op-attrs/ops/reduction.h
+++ b/lib/op-attrs/include/op-attrs/ops/reduction.h
@@ -11,7 +11,6 @@ namespace FlexFlow {
 struct ReductionAttrs {
   ff_dim_t reduction_dim;
   req<int> reduction_degree;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReductionAttrs, reduction_dim, reduction_degree);
 CHECK_VALID_OP_ATTR(ReductionAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/repartition.h b/lib/op-attrs/include/op-attrs/ops/repartition.h
index a795017bf4..8abdc6eb1c 100644
--- a/lib/op-attrs/include/op-attrs/ops/repartition.h
+++ b/lib/op-attrs/include/op-attrs/ops/repartition.h
@@ -11,7 +11,6 @@ namespace FlexFlow {
 struct RepartitionAttrs {
   ff_dim_t repartition_dim;
   req<int> repartition_degree;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(RepartitionAttrs, repartition_dim, repartition_degree);
 CHECK_VALID_OP_ATTR(RepartitionAttrs);
diff --git a/lib/op-attrs/src/reduction.cc b/lib/op-attrs/src/reduction.cc
index 9196000a05..6336e15253 100644
--- a/lib/op-attrs/src/reduction.cc
+++ b/lib/op-attrs/src/reduction.cc
@@ -10,13 +10,6 @@ namespace FlexFlow {
 /*   return output; */
 /* } */
 
-bool ReductionAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(ReductionAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
   ParallelTensorShape output(input_shape.dims, input_shape.data_type);
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index b5a0280d85..292d90d2e2 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -1,27 +1,17 @@
 #include "op-attrs/ops/repartition.h"
 #include "op-attrs/parallel_dim.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-/* bool RepartitionAttrs::is_valid(ParallelTensorShape const &input_shape) const
- * { */
-/*   ParallelDim dim = input_shape.at(this->repartition_legion_dim); */
-/*   return (dim.size % this->repartition_degree * dim.degree == 0); */
-/* } */
-
-bool RepartitionAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  ParallelDim dim = input.at(this->repartition_dim);
-  return (dim.size % this->repartition_degree * dim.degree == 0);
-}
-
-// this may be wrong partition by n multiplies degree by n and keeps shape the
-// same
+// Repartitioning by n multiplies the degree of repartition_dim by n and keeps the sizes the same (NOTE(review): semantics unconfirmed).
 ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
-                                     ParallelTensorShape const &input_shape) {
-  ParallelTensorShape output(input_shape.dims, input_shape.data_type);
+                                     ParallelTensorShape const &input) {
+  ParallelDim dim = input.at(attrs.repartition_dim);
+  if (dim.size % (attrs.repartition_degree * dim.degree) != 0) { // '%' and '*' share precedence, so the new total degree must be parenthesized
+    throw mk_runtime_error("RepartitionAttrs: dim.size % (attrs.repartition_degree * dim.degree) != 0");
+  }
+  ParallelTensorShape output(input.dims, input.data_type);
   output.at(attrs.repartition_dim).degree *= attrs.repartition_degree;
   return output;
 }

From 9c93f0751219a74fb43282eb78442e66fbfc171a Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:24:13 +0000
Subject: [PATCH 47/69] remove aggregate

---
 .../src/cuda/aggregate_spec_kernels.cu        | 176 +++++++-----------
 lib/op-attrs/include/op-attrs/ops/reshape.h   |   1 -
 2 files changed, 72 insertions(+), 105 deletions(-)

diff --git a/lib/kernels/src/cuda/aggregate_spec_kernels.cu b/lib/kernels/src/cuda/aggregate_spec_kernels.cu
index d46dc64567..ed4f656131 100644
--- a/lib/kernels/src/cuda/aggregate_spec_kernels.cu
+++ b/lib/kernels/src/cuda/aggregate_spec_kernels.cu
@@ -12,12 +12,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "kernels/aggregate_spec_kernels.h"
 #include "kernels/cuda_helper.h"
-
 namespace FlexFlow {
-
 AggregateSpecPerDeviceState::AggregateSpecPerDeviceState(FFHandler handler,
                                                          int n)
     : PerDeviceOpState(handler) {
@@ -26,10 +23,80 @@ AggregateSpecPerDeviceState::AggregateSpecPerDeviceState(FFHandler handler,
 AggregateSpecPerDeviceState::~AggregateSpecPerDeviceState(void) {
   checkCUDA(cudaFree(&dev_region_ptrs));
 }
-
 namespace Kernels {
 namespace AggregateSpec {
 
+void forward_kernel(cudaStream_t stream,
+                    AggregateSpecPerDeviceState const *m,
+                    float **exp_preds,
+                    int const *acc_gate_assign_ptr,
+                    float *acc_output_ptr,
+                    int n,
+                    int const k,
+                    int rows,
+                    int const batch_size,
+                    int out_dim) {
+
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  // call forward kernel
+  cudaMemcpy(m->dev_region_ptrs,
+             exp_preds,
+             n * sizeof(float *),
+             cudaMemcpyHostToDevice);
+
+  aggspec_forward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
+                           min(CUDA_NUM_THREADS,
+                               (int)(batch_size * k * out_dim)),
+                           0,
+                           stream>>>(m->dev_region_ptrs,
+                                     acc_gate_assign_ptr,
+                                     acc_output_ptr,
+                                     n,
+                                     k,
+                                     rows,
+                                     batch_size,
+                                     out_dim);
+}
+void backward_kernel(cudaStream_t stream,
+                     AggregateSpecPerDeviceState const *m,
+                     float **exp_grads,
+                     int const *acc_gate_assign_ptr,
+                     int const *acc_true_gate_assign_ptr,
+                     float const *acc_gate_pred_ptr,
+                     float *acc_full_gate_grad_ptr,
+                     float const *acc_output_grad_ptr,
+                     int n,
+                     int const k,
+                     int rows,
+                     float lambda_bal,
+                     int const batch_size,
+                     int out_dim) {
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  // call backward kernel
+  cudaMemcpy(m->dev_region_ptrs,
+             exp_grads,
+             n * sizeof(float *),
+             cudaMemcpyHostToDevice);
+  aggspec_backward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
+                            min(CUDA_NUM_THREADS,
+                                (int)(batch_size * k * out_dim)),
+                            0,
+                            stream>>>(m->dev_region_ptrs,
+                                      acc_gate_assign_ptr,
+                                      acc_true_gate_assign_ptr,
+                                      acc_gate_pred_ptr,
+                                      acc_full_gate_grad_ptr,
+                                      acc_output_grad_ptr,
+                                      n,
+                                      k,
+                                      rows,
+                                      lambda_bal,
+                                      batch_size,
+                                      out_dim);
+}
 __global__ void
     aggspec_forward_kernel(float **exp_preds,
                            int const *exp_assign,
@@ -41,7 +108,6 @@ __global__ void
                            int out_dim) {
   __shared__ float
       *chosen_exp_preds[AGGREGATE_SPEC_MAX_K * AGGREGATE_SPEC_MAX_BATCH_SIZE];
-
   // Get pred pointers, single thread per block
   if (threadIdx.x == 0) {
     int expert_idx[AGGREGATE_SPEC_MAX_N] = {0};
@@ -60,9 +126,7 @@ __global__ void
       }
     }
   }
-
   __syncthreads();
-
   // compute output
   CUDA_KERNEL_LOOP(i, k * batch_size * out_dim) {
     if (chosen_exp_preds[i / out_dim] != 0) {
@@ -72,7 +136,6 @@ __global__ void
     }
   }
 }
-
 __device__ void aggspec_backward_kernel_gate(float const *output_grad,
                                              float *full_gate_grads,
                                              int const *expert_assign,
@@ -84,16 +147,12 @@ __device__ void aggspec_backward_kernel_gate(float const *output_grad,
                                              int k,
                                              int n,
                                              int out_dim) {
-
   __shared__ float gate_grad_sum[AGGREGATE_SPEC_MAX_BATCH_SIZE];
-
   // init gate_grad_sum to 0
   CUDA_KERNEL_LOOP(i, batch_size) {
     gate_grad_sum[i] = 0.0f;
   }
-
   __syncthreads();
-
   // get sum of expert errors
   /* NOTE: Errors just squared L2 norm of gradients. * batch_size because the
   expert gradients are /= batch_size and then it would be /= batch_size^2 here
@@ -108,7 +167,6 @@ __device__ void aggspec_backward_kernel_gate(float const *output_grad,
       atomicAdd(gate_grad_sum + i / (k * out_dim), res);
     }
   }
-
   // Compute gate gradients:
   // Assigned expert i, sample j: pred(i,j) - err_(i,j)/sum_l err(l,j)
   __syncthreads();
@@ -118,15 +176,12 @@ __device__ void aggspec_backward_kernel_gate(float const *output_grad,
       full_gate_grads[i / k * n + expert_assign[i]] -= (1.0f - gate_pred[i]);
     }
   }
-
   // balance term
   __syncthreads();
   CUDA_KERNEL_LOOP(i, n * batch_size) {
     full_gate_grads[i] += lambda_bal * expert_bal[i % n];
   }
-
   __syncthreads();
-
   // make 0 mean
   CUDA_KERNEL_LOOP(i, n * batch_size) {
     int start = (i / n) * n;
@@ -136,7 +191,6 @@ __device__ void aggspec_backward_kernel_gate(float const *output_grad,
     }
   }
 }
-
 __device__ void aggspec_backward_kernel_exp(float const *output_grad,
                                             float const *gate_preds,
                                             float **exp_grads,
@@ -151,7 +205,6 @@ __device__ void aggspec_backward_kernel_exp(float const *output_grad,
     }
   }
 }
-
 __global__ void
     aggspec_backward_kernel(float **exp_grads,
                             int const *exp_assign,
@@ -169,7 +222,6 @@ __global__ void
       *chosen_exp_grads[AGGREGATE_SPEC_MAX_K * AGGREGATE_SPEC_MAX_BATCH_SIZE];
   __shared__ int expert_bal[AGGREGATE_SPEC_MAX_N];
   __shared__ bool cache_corr[AGGREGATE_SPEC_MAX_BATCH_SIZE];
-
   // Get pred pointers, single thread per block
   if (threadIdx.x == 0) {
     // init arrays
@@ -179,7 +231,6 @@ __global__ void
     for (int i = 0; i < batch_size; i++) {
       cache_corr[i] = true;
     }
-
     // Get pointer to chosen expert grads and expert counts
     for (int i = 0; i < batch_size; i++) {
       for (int j = 0; j < k; j++) {
@@ -199,14 +250,11 @@ __global__ void
       }
     }
   }
-
   __syncthreads();
-
   // NOTE: These 2 functions could execute independently in parallel
   // get expert gradients
   aggspec_backward_kernel_exp(
       output_grads, gating_net_preds, chosen_exp_grads, batch_size, k, out_dim);
-
   // get gating net gradients
   aggspec_backward_kernel_gate(output_grads,
                                full_gating_grads,
@@ -219,84 +267,4 @@ __global__ void
                                k,
                                n,
                                out_dim);
-}
-
-void forward_kernel(cudaStream_t stream,
-                    AggregateSpecPerDeviceState const *m,
-                    float **exp_preds,
-                    int const *acc_gate_assign_ptr,
-                    float *acc_output_ptr,
-                    int n,
-                    int const k,
-                    int rows,
-                    int const batch_size,
-                    int out_dim) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
-  // call forward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_preds,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-
-  aggspec_forward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                           min(CUDA_NUM_THREADS,
-                               (int)(batch_size * k * out_dim)),
-                           0,
-                           stream>>>(m->dev_region_ptrs,
-                                     acc_gate_assign_ptr,
-                                     acc_output_ptr,
-                                     n,
-                                     k,
-                                     rows,
-                                     batch_size,
-                                     out_dim);
-}
-
-void backward_kernel(cudaStream_t stream,
-                     AggregateSpecPerDeviceState const *m,
-                     float **exp_grads,
-                     int const *acc_gate_assign_ptr,
-                     int const *acc_true_gate_assign_ptr,
-                     float const *acc_gate_pred_ptr,
-                     float *acc_full_gate_grad_ptr,
-                     float const *acc_output_grad_ptr,
-                     int n,
-                     int const k,
-                     int rows,
-                     float lambda_bal,
-                     int const batch_size,
-                     int out_dim) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
-  // call backward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_grads,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-
-  aggspec_backward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                            min(CUDA_NUM_THREADS,
-                                (int)(batch_size * k * out_dim)),
-                            0,
-                            stream>>>(m->dev_region_ptrs,
-                                      acc_gate_assign_ptr,
-                                      acc_true_gate_assign_ptr,
-                                      acc_gate_pred_ptr,
-                                      acc_full_gate_grad_ptr,
-                                      acc_output_grad_ptr,
-                                      n,
-                                      k,
-                                      rows,
-                                      lambda_bal,
-                                      batch_size,
-                                      out_dim);
-}
-
-} // namespace AggregateSpec
-} // namespace Kernels
-} // namespace FlexFlow
+}
\ No newline at end of file
diff --git a/lib/op-attrs/include/op-attrs/ops/reshape.h b/lib/op-attrs/include/op-attrs/ops/reshape.h
index 7fbe573c93..78b9806fe7 100644
--- a/lib/op-attrs/include/op-attrs/ops/reshape.h
+++ b/lib/op-attrs/include/op-attrs/ops/reshape.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 
 struct ReshapeAttrs {
   TensorShape shape;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReshapeAttrs, shape);
 CHECK_VALID_OP_ATTR(ReshapeAttrs);

From 7377bee136b081af9640d115454be08ff5d85599 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:24:46 +0000
Subject: [PATCH 48/69] remove aggregate

---
 .../src/cuda/aggregate_spec_kernels.cu        | 270 ------------------
 1 file changed, 270 deletions(-)
 delete mode 100644 lib/kernels/src/cuda/aggregate_spec_kernels.cu

diff --git a/lib/kernels/src/cuda/aggregate_spec_kernels.cu b/lib/kernels/src/cuda/aggregate_spec_kernels.cu
deleted file mode 100644
index ed4f656131..0000000000
--- a/lib/kernels/src/cuda/aggregate_spec_kernels.cu
+++ /dev/null
@@ -1,270 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "kernels/aggregate_spec_kernels.h"
-#include "kernels/cuda_helper.h"
-namespace FlexFlow {
-AggregateSpecPerDeviceState::AggregateSpecPerDeviceState(FFHandler handler,
-                                                         int n)
-    : PerDeviceOpState(handler) {
-  checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *)));
-}
-AggregateSpecPerDeviceState::~AggregateSpecPerDeviceState(void) {
-  checkCUDA(cudaFree(&dev_region_ptrs));
-}
-namespace Kernels {
-namespace AggregateSpec {
-
-void forward_kernel(cudaStream_t stream,
-                    AggregateSpecPerDeviceState const *m,
-                    float **exp_preds,
-                    int const *acc_gate_assign_ptr,
-                    float *acc_output_ptr,
-                    int n,
-                    int const k,
-                    int rows,
-                    int const batch_size,
-                    int out_dim) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
-  // call forward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_preds,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-
-  aggspec_forward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                           min(CUDA_NUM_THREADS,
-                               (int)(batch_size * k * out_dim)),
-                           0,
-                           stream>>>(m->dev_region_ptrs,
-                                     acc_gate_assign_ptr,
-                                     acc_output_ptr,
-                                     n,
-                                     k,
-                                     rows,
-                                     batch_size,
-                                     out_dim);
-}
-void backward_kernel(cudaStream_t stream,
-                     AggregateSpecPerDeviceState const *m,
-                     float **exp_grads,
-                     int const *acc_gate_assign_ptr,
-                     int const *acc_true_gate_assign_ptr,
-                     float const *acc_gate_pred_ptr,
-                     float *acc_full_gate_grad_ptr,
-                     float const *acc_output_grad_ptr,
-                     int n,
-                     int const k,
-                     int rows,
-                     float lambda_bal,
-                     int const batch_size,
-                     int out_dim) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  // call backward kernel
-  cudaMemcpy(m->dev_region_ptrs,
-             exp_grads,
-             n * sizeof(float *),
-             cudaMemcpyHostToDevice);
-  aggspec_backward_kernel<<<GET_BLOCKS(batch_size * k * out_dim),
-                            min(CUDA_NUM_THREADS,
-                                (int)(batch_size * k * out_dim)),
-                            0,
-                            stream>>>(m->dev_region_ptrs,
-                                      acc_gate_assign_ptr,
-                                      acc_true_gate_assign_ptr,
-                                      acc_gate_pred_ptr,
-                                      acc_full_gate_grad_ptr,
-                                      acc_output_grad_ptr,
-                                      n,
-                                      k,
-                                      rows,
-                                      lambda_bal,
-                                      batch_size,
-                                      out_dim);
-}
-__global__ void
-    aggspec_forward_kernel(float **exp_preds,
-                           int const *exp_assign,
-                           float *output,
-                           int n,           // num experts
-                           int const k,     // num chosen experts
-                           int exp_samples, // max samples per expert
-                           int const batch_size,
-                           int out_dim) {
-  __shared__ float
-      *chosen_exp_preds[AGGREGATE_SPEC_MAX_K * AGGREGATE_SPEC_MAX_BATCH_SIZE];
-  // Get pred pointers, single thread per block
-  if (threadIdx.x == 0) {
-    int expert_idx[AGGREGATE_SPEC_MAX_N] = {0};
-    for (int i = 0; i < batch_size; i++) {
-      for (int j = 0; j < k; j++) {
-        // Get pointer to chosen expert predictions
-        int expert = exp_assign[i * k + j];
-        if (expert_idx[expert] >= exp_samples) {
-          // dropped sample
-          chosen_exp_preds[i * k + j] = 0;
-          continue;
-        }
-        chosen_exp_preds[i * k + j] =
-            exp_preds[expert] + expert_idx[expert] * out_dim;
-        expert_idx[expert]++;
-      }
-    }
-  }
-  __syncthreads();
-  // compute output
-  CUDA_KERNEL_LOOP(i, k * batch_size * out_dim) {
-    if (chosen_exp_preds[i / out_dim] != 0) {
-      output[i] = chosen_exp_preds[i / out_dim][i % out_dim];
-    } else {
-      output[i] = 0.0f;
-    }
-  }
-}
-__device__ void aggspec_backward_kernel_gate(float const *output_grad,
-                                             float *full_gate_grads,
-                                             int const *expert_assign,
-                                             bool const *cache_corr,
-                                             float const *gate_pred,
-                                             int *expert_bal,
-                                             float lambda_bal,
-                                             int batch_size,
-                                             int k,
-                                             int n,
-                                             int out_dim) {
-  __shared__ float gate_grad_sum[AGGREGATE_SPEC_MAX_BATCH_SIZE];
-  // init gate_grad_sum to 0
-  CUDA_KERNEL_LOOP(i, batch_size) {
-    gate_grad_sum[i] = 0.0f;
-  }
-  __syncthreads();
-  // get sum of expert errors
-  /* NOTE: Errors just squared L2 norm of gradients. * batch_size because the
-  expert gradients are /= batch_size and then it would be /= batch_size^2 here
-*/
-  CUDA_KERNEL_LOOP(i, batch_size * k * out_dim) {
-    if (cache_corr[i / (k * out_dim)]) {
-      float res = output_grad[i] * output_grad[i] * batch_size;
-      float *gate_grad_idx =
-          full_gate_grads + (i / (out_dim * k)) * n +
-          expert_assign[(i / (out_dim * k)) * k + (i / out_dim) % k];
-      atomicAdd(gate_grad_idx, res);
-      atomicAdd(gate_grad_sum + i / (k * out_dim), res);
-    }
-  }
-  // Compute gate gradients:
-  // Assigned expert i, sample j: pred(i,j) - err_(i,j)/sum_l err(l,j)
-  __syncthreads();
-  CUDA_KERNEL_LOOP(i, k * batch_size) {
-    if (cache_corr[i / k]) {
-      full_gate_grads[i / k * n + expert_assign[i]] /= gate_grad_sum[i / k];
-      full_gate_grads[i / k * n + expert_assign[i]] -= (1.0f - gate_pred[i]);
-    }
-  }
-  // balance term
-  __syncthreads();
-  CUDA_KERNEL_LOOP(i, n * batch_size) {
-    full_gate_grads[i] += lambda_bal * expert_bal[i % n];
-  }
-  __syncthreads();
-  // make 0 mean
-  CUDA_KERNEL_LOOP(i, n * batch_size) {
-    int start = (i / n) * n;
-    float sub = -full_gate_grads[i] / n;
-    for (int j = 0; j < n; j++) {
-      atomicAdd(full_gate_grads + start + j, sub);
-    }
-  }
-}
-__device__ void aggspec_backward_kernel_exp(float const *output_grad,
-                                            float const *gate_preds,
-                                            float **exp_grads,
-                                            int batch_size,
-                                            int k,
-                                            int out_dim) {
-  // compute expert gradients
-  CUDA_KERNEL_LOOP(i, k * out_dim * batch_size) {
-    if (exp_grads[i / out_dim] != 0) {
-      exp_grads[i / out_dim][i % out_dim] +=
-          gate_preds[i / out_dim] * output_grad[i];
-    }
-  }
-}
-__global__ void
-    aggspec_backward_kernel(float **exp_grads,
-                            int const *exp_assign,
-                            int const *true_exp_assign,
-                            float const *gating_net_preds,
-                            float *full_gating_grads,
-                            float const *output_grads,
-                            int n,           // num experts
-                            int k,           // num chosen experts
-                            int exp_samples, // max samples per expert
-                            float lambda_bal,
-                            int batch_size,
-                            int out_dim) {
-  __shared__ float
-      *chosen_exp_grads[AGGREGATE_SPEC_MAX_K * AGGREGATE_SPEC_MAX_BATCH_SIZE];
-  __shared__ int expert_bal[AGGREGATE_SPEC_MAX_N];
-  __shared__ bool cache_corr[AGGREGATE_SPEC_MAX_BATCH_SIZE];
-  // Get pred pointers, single thread per block
-  if (threadIdx.x == 0) {
-    // init arrays
-    for (int i = 0; i < n; i++) {
-      expert_bal[i] = 0;
-    }
-    for (int i = 0; i < batch_size; i++) {
-      cache_corr[i] = true;
-    }
-    // Get pointer to chosen expert grads and expert counts
-    for (int i = 0; i < batch_size; i++) {
-      for (int j = 0; j < k; j++) {
-        int expert = true_exp_assign[k * i + j];
-        if (expert != exp_assign[k * i + j]) {
-          cache_corr[i] = false;
-        }
-        if (expert_bal[expert] >= exp_samples) {
-          // dropped sample
-          chosen_exp_grads[i * k + j] = 0;
-          expert_bal[expert]++;
-          continue;
-        }
-        chosen_exp_grads[i * k + j] =
-            exp_grads[expert] + expert_bal[expert] * out_dim;
-        expert_bal[expert]++;
-      }
-    }
-  }
-  __syncthreads();
-  // NOTE: These 2 functions could execute independently in parallel
-  // get expert gradients
-  aggspec_backward_kernel_exp(
-      output_grads, gating_net_preds, chosen_exp_grads, batch_size, k, out_dim);
-  // get gating net gradients
-  aggspec_backward_kernel_gate(output_grads,
-                               full_gating_grads,
-                               exp_assign,
-                               cache_corr,
-                               gating_net_preds,
-                               expert_bal,
-                               (lambda_bal * n) / batch_size,
-                               batch_size,
-                               k,
-                               n,
-                               out_dim);
-}
\ No newline at end of file

From a5f1a0e3629e3c432de44d6bfaf468d517a79642 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:50:24 +0000
Subject: [PATCH 49/69] add reshape

---
 lib/op-attrs/src/reshape.cc | 51 ++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/lib/op-attrs/src/reshape.cc b/lib/op-attrs/src/reshape.cc
index e100efeadb..c715c8dada 100644
--- a/lib/op-attrs/src/reshape.cc
+++ b/lib/op-attrs/src/reshape.cc
@@ -1,34 +1,45 @@
 #include "op-attrs/ops/reshape.h"
 #include "op-attrs/ff_dim.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-// pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and the
-// output is [4, 6]
-bool ReshapeAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  std::size_t input_volume = 1;
-  for (int i = 0; i < input.num_dims(); i++) {
-    input_volume *= input.at(ff_dim_t(i)).size;
-  }
-  std::size_t attrs_volume = 1;
-  for (int i = 0; i < this->shape.dims.num_dims(); i++) {
-    attrs_volume *= this->shape.at(ff_dim_t(i));
-  }
-  return (input_volume == attrs_volume);
-}
-
+//https://pytorch.org/docs/stable/generated/torch.reshape.html
 // pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and the
 // output is [4, 6] currently we doesn't consider the case of -1,we can support
 // this later the input:[2,3,4], attrs.shape:[4,6], the output is [4, 6]
 ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+  std::size_t input_volume = input.dims.get_volume();
+  std::size_t attrs_volume = 1;
+  for (int i = 0; i < attrs.shape.dims.num_dims(); i++) {
+    attrs_volume *= attrs.shape.at(ff_dim_t(i));
+  }
+  if(input_volume != attrs_volume) {
+    throw mk_runtime_error("ReshapeAttrs: input_volume != attrs_volume");
+  }
 
-  assert(attrs.is_valid(input) && "input is not valid");
-  ParallelTensorDims dims{attrs.shape.dims};
-  ParallelTensorShape output{dims, input.data_type};
+  ParallelTensorShape output = input;
+  output.data_type = input.data_type;
+  if(attrs.shape.dims.num_dims() == 1) {
+      //infer the shape
+      if(attrs.shape.at(ff_dim_t(0)) == -1) {
+       
+        output.at(ff_dim_t(0)).size = input_volume ;
+        output.at(ff_dim_t(0)).degree = 1;
+        output.at(ff_dim_t(0)).is_replica_dim = false;
+      } else {
+        output.at(ff_dim_t(0)).size = attrs.shape.at(ff_dim_t(0));
+        output.at(ff_dim_t(1)).size = input_volume / attrs.shape.at(ff_dim_t(0));
+        for(int i = 0; i < 2; i++) {
+          output.at(ff_dim_t(i)).degree = 1;
+          output.at(ff_dim_t(i)).is_replica_dim = false;
+        }
+      }
+  } else {
+      ParallelTensorDims dims{attrs.shape.dims};
+      output = {dims, input.data_type};
+  }
   return output;
 }
 

From 812708f50d68d31e12961a6a0009722cab7a2275 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 17 Oct 2023 15:58:24 +0000
Subject: [PATCH 50/69] conv2d done

---
 lib/op-attrs/src/batch_matmul.cc | 15 ++++++++++-----
 lib/op-attrs/src/batch_norm.cc   |  1 +
 lib/op-attrs/src/concat.cc       |  3 +++
 lib/op-attrs/src/conv_2d.cc      | 13 ++++++++++---
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index c4ff074302..f1c0e12968 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -37,22 +37,27 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
 
   if (lhs.at(ff_dim_t(1)).degree == 1 && lhs.at(ff_dim_t(2)).degree == 1) {
     // case 0: degree is 1, [b, n, m], rhs: [b, m, p] -> [b, n, p]
-    output_shape.at(ff_dim_t(0)).is_replica_dim = false;
+    for(int i =1; i < lhs.num_dims(); i++) {
+      output_shape.at(ff_dim_t(i)).degree = 1;
+      output_shape.at(ff_dim_t(i)).is_replica_dim = false;
+    }
   } else if (lhs.at(ff_dim_t(1)).degree == 1 &&
              lhs.at(ff_dim_t(2)).degree >
                  1) { // case 1: [b, n, m/x], [b, m/x, p] => [b, n, y]
-    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
     output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
   } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
              lhs.at(ff_dim_t(2)).degree ==
                  1) { // case 2: [b, n/x, m] [b m p/x] => [b n/x p/x]
-    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(1)).degree = rhs.at(ff_dim_t(1)).degree;
+    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
     output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
   } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
              lhs.at(ff_dim_t(2)).degree >
                  1) { // case 3: [b n/x m/y] [b m/y p/x]=> [b n/x p/x]
-    output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
+    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
     output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
     output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
   } else {
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index be11ac0e13..777cca1df6 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -12,6 +12,7 @@ ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
         "BatchNormAttrs::get_output_shape: input is invalid");
   }
   ParallelTensorShape output_shape = input;
+  //the degree of the output is the same as the input
   return output_shape;
 }
 
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 2f65ac5623..5c9dc3e370 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -16,6 +16,9 @@ ParallelTensorShape
   for (auto &i : inputs) {
     output.at(attrs.axis).size += i.at(attrs.axis).size;
   }
+  output.at(attrs.axis).degree = inputs[0].at(attrs.axis).degree;
+  output.at(attrs.axis).is_replica_dim = inputs[0].at(attrs.axis).degree >= 1;
+  return output;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 08e8315952..d201c64e3d 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/conv_2d.h"
 #include "op-attrs/ff_dim.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
 #include "utils/exception.h"
@@ -111,6 +112,7 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
       (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
           attrs.stride_w +
       1;
+  
   if (input.at(ff_dim_t(2)).size == 1 && input.at(ff_dim_t(3)).size == 1) {
     // case 1  input degree is 1, like 1GPU
     output.at(ff_dim_t(0)).is_replica_dim = false;
@@ -119,16 +121,21 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
     // case 2: [b, input_channel, input_h/x, input_w], [output_channel,
     // input_channel, kernel_h, kernel_w] => [b, output_channel, output_h/x,
     // output_w]
-    output.at(ff_dim_t(0)).is_replica_dim = true;
+    output.at(ff_dim_t(2)).is_replica_dim = true;
     output.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
-    output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
   } else if (input.at(ff_dim_t(2)).size == 1 &&
              input.at(ff_dim_t(3)).size > 1) {
     // case 3: [b, input_channel, input_h, input_w / x] [output_channel,
     // input_channel, kernel_h, kernel_w / x] => [b, output_channel, output_h,
     // output_w / x]
-    output.at(ff_dim_t(0)).is_replica_dim = true;
+    output.at(ff_dim_t(3)).is_replica_dim = true;
     output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
+  } else if(input.at(ff_dim_t(2)).size >1  &&
+             input.at(ff_dim_t(3)).size > 1) {
+      for(int i =2; i < input.num_dims();i++) {
+        output.at(ff_dim_t(i)).is_replica_dim = true;
+        output.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
+      }
   } else {
     throw mk_runtime_error("Conv2DAttrs::get_output_shape: not supported in "
                            "Conv2DAttrs get_output_shape");

From 2bdaf0991e09196861068ebe9669727d1278769b Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 18 Oct 2023 15:23:24 +0000
Subject: [PATCH 51/69] add more shape

---
 lib/op-attrs/include/op-attrs/ops/reverse.h |  1 -
 lib/op-attrs/include/op-attrs/ops/softmax.h |  1 -
 lib/op-attrs/include/op-attrs/ops/split.h   |  1 -
 lib/op-attrs/include/op-attrs/ops/topk.h    |  2 -
 lib/op-attrs/src/batch_matmul.cc            |  2 +-
 lib/op-attrs/src/batch_norm.cc              |  2 +-
 lib/op-attrs/src/conv_2d.cc                 | 13 +++---
 lib/op-attrs/src/embedding.cc               |  2 +-
 lib/op-attrs/src/flat.cc                    |  2 +
 lib/op-attrs/src/linear.cc                  |  1 -
 lib/op-attrs/src/pool_2d.cc                 | 12 ++++--
 lib/op-attrs/src/repartition.cc             |  8 ++--
 lib/op-attrs/src/reshape.cc                 | 44 +++++++++++----------
 lib/op-attrs/src/reverse.cc                 | 15 +++----
 lib/op-attrs/src/softmax.cc                 | 15 ++-----
 lib/op-attrs/src/split.cc                   | 29 ++++++--------
 lib/op-attrs/src/topk.cc                    | 21 +++++-----
 lib/op-attrs/src/transpose.cc               | 37 ++++++++---------
 18 files changed, 95 insertions(+), 113 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/reverse.h b/lib/op-attrs/include/op-attrs/ops/reverse.h
index 0c8657c6ec..ce1295f437 100644
--- a/lib/op-attrs/include/op-attrs/ops/reverse.h
+++ b/lib/op-attrs/include/op-attrs/ops/reverse.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 
 struct ReverseAttrs {
   ff_dim_t axis;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReverseAttrs, axis);
 CHECK_VALID_OP_ATTR(ReverseAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/softmax.h b/lib/op-attrs/include/op-attrs/ops/softmax.h
index 8e7a00e661..8f31bccdef 100644
--- a/lib/op-attrs/include/op-attrs/ops/softmax.h
+++ b/lib/op-attrs/include/op-attrs/ops/softmax.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 
 struct SoftmaxAttrs {
   ff_dim_t dim;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(SoftmaxAttrs, dim);
 CHECK_VALID_OP_ATTR(SoftmaxAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/split.h b/lib/op-attrs/include/op-attrs/ops/split.h
index e2abeb2581..14f9395a26 100644
--- a/lib/op-attrs/include/op-attrs/ops/split.h
+++ b/lib/op-attrs/include/op-attrs/ops/split.h
@@ -10,7 +10,6 @@ namespace FlexFlow {
 struct SplitAttrs {
   req<stack_vector<int, MAX_NUM_OUTPUTS>> splits;
   ff_dim_t axis;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(SplitAttrs, splits, axis);
 CHECK_VALID_OP_ATTR(SplitAttrs);
diff --git a/lib/op-attrs/include/op-attrs/ops/topk.h b/lib/op-attrs/include/op-attrs/ops/topk.h
index 914ac1afc2..3a3b49ab3b 100644
--- a/lib/op-attrs/include/op-attrs/ops/topk.h
+++ b/lib/op-attrs/include/op-attrs/ops/topk.h
@@ -7,13 +7,11 @@
 
 namespace FlexFlow {
 
-// I think we should add axis
 // pytorch code: torch.topk(input_tensor, k, largest=True, sorted=True, dim=dim)
 struct TopKAttrs {
   req<int> k;
   req<bool> sorted;
   req<int> axis;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(TopKAttrs, k, sorted, axis);
 CHECK_VALID_OP_ATTR(TopKAttrs);
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index f1c0e12968..deabdcea5b 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -37,7 +37,7 @@ ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
 
   if (lhs.at(ff_dim_t(1)).degree == 1 && lhs.at(ff_dim_t(2)).degree == 1) {
     // case 0: degree is 1, [b, n, m], rhs: [b, m, p] -> [b, n, p]
-    for(int i =1; i < lhs.num_dims(); i++) {
+    for (int i = 1; i < lhs.num_dims(); i++) {
       output_shape.at(ff_dim_t(i)).degree = 1;
       output_shape.at(ff_dim_t(i)).is_replica_dim = false;
     }
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 777cca1df6..2f050814c9 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -12,7 +12,7 @@ ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
         "BatchNormAttrs::get_output_shape: input is invalid");
   }
   ParallelTensorShape output_shape = input;
-  //the degree of the output is the same as the input
+  // the degree of the output is the same as the input
   return output_shape;
 }
 
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index d201c64e3d..15566f9005 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -112,7 +112,7 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
       (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
           attrs.stride_w +
       1;
-  
+
   if (input.at(ff_dim_t(2)).size == 1 && input.at(ff_dim_t(3)).size == 1) {
     // case 1  input degree is 1, like 1GPU
     output.at(ff_dim_t(0)).is_replica_dim = false;
@@ -130,12 +130,11 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
     // output_w / x]
     output.at(ff_dim_t(3)).is_replica_dim = true;
     output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
-  } else if(input.at(ff_dim_t(2)).size >1  &&
-             input.at(ff_dim_t(3)).size > 1) {
-      for(int i =2; i < input.num_dims();i++) {
-        output.at(ff_dim_t(i)).is_replica_dim = true;
-        output.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
-      }
+  } else if (input.at(ff_dim_t(2)).size > 1 && input.at(ff_dim_t(3)).size > 1) {
+    for (int i = 2; i < input.num_dims(); i++) {
+      output.at(ff_dim_t(i)).is_replica_dim = true;
+      output.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
+    }
   } else {
     throw mk_runtime_error("Conv2DAttrs::get_output_shape: not supported in "
                            "Conv2DAttrs get_output_shape");
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index fa8f313457..5e86335f14 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -13,6 +13,6 @@ ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
   output.at(ff_dim_t(2)).size = atts.out_channels;
   // output degree is same as input degree
   return output;
-} // namespace FlexFlow
+}
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/flat.cc b/lib/op-attrs/src/flat.cc
index ae351328b7..51c29ec5b7 100644
--- a/lib/op-attrs/src/flat.cc
+++ b/lib/op-attrs/src/flat.cc
@@ -27,6 +27,8 @@ ParallelTensorShape get_output_shape(FlatAttrs const &attrs,
       input.at(ff_dim_t(Input::WIDTH)).size;
   output_shape.at(ff_dim_t(Output::CHANNEL)).degree =
       input.at(ff_dim_t(Input::CHANNEL)).degree;
+  output_shape.at(ff_dim_t(Output::CHANNEL)).is_replica_dim =
+      (input.at(ff_dim_t(Input::CHANNEL)).degree > 1);
 
   return output_shape;
 }
diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index 3bb8e0f3ae..ec0f5dd235 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -23,7 +23,6 @@ ParallelTensorShape get_output_shape(LinearAttrs const &atts,
   // linear shoud consider the degree
   // case 1: input:[N, K], weight:[K, M], degree is 1
   if (input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree == 1) {
-    out_shape.at(ff_dim_t(0)).degree = 1;
     for (int i = 0; i < input.num_dims(); i++) {
       out_shape.at(ff_dim_t(i)).is_replica_dim = false;
       out_shape.at(ff_dim_t(i)).degree = 1;
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index 65754de6e3..b3859c6e06 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -81,7 +81,8 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
   }
 
   // case 1: input:[N, C, H, W], output:[N, C, 1, 1], degree is 1 for avgpool2d
-  // input: [N, C, H, W], output: [N, C, output_height, output_width], degree is  1 for maxpool2d
+  // input: [N, C, H, W], output: [N, C, output_height, output_width], degree is
+  // 1 for maxpool2d
   if (input.at(ff_dim_t(2)).degree == 1 && input.at(ff_dim_t(3)).degree == 1) {
     for (int i = 2; i < input.num_dims(); i++) {
       output_shape.at(ff_dim_t(i)).is_replica_dim = false;
@@ -90,7 +91,8 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
   } else if (input.at(ff_dim_t(2)).degree > 1 &&
              input.at(ff_dim_t(3)).degree == 1) {
     // case 2: input [N, C, H/X, W] output [N, C, 1, 1], degree is X
-    // input [N, C, H/X, W] output [N, C, output_height/x, output_width], degree  is X
+    // input [N, C, H/X, W] output [N, C, output_height/x, output_width], degree
+    // is X
     output_shape.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
     output_shape.at(ff_dim_t(2)).is_replica_dim = true;
     output_shape.at(ff_dim_t(3)).degree = 1;
@@ -98,7 +100,8 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
   } else if (input.at(ff_dim_t(2)).degree == 1 &&
              input.at(ff_dim_t(3)).degree > 1) {
     // case 3: input [N, C, H, W/X] output [N, C, 1, 1], degree is X
-    // input [N, C, H, W/X] output [N, C, output_height, output_width/x], degree is X
+    // input [N, C, H, W/X] output [N, C, output_height, output_width/x], degree
+    // is X
     output_shape.at(ff_dim_t(2)).degree = 1;
     output_shape.at(ff_dim_t(2)).is_replica_dim = false;
     output_shape.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
@@ -106,7 +109,8 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
   } else if (input.at(ff_dim_t(2)).degree > 1 &&
              input.at(ff_dim_t(3)).degree > 1) {
     // case 4: input [N, C, H/X, W/Y] output [N, C, 1, 1], degree is X and Y for
-    // avgpool2d input [N, C, H/X, W/Y] output [N, C, output_height/x, output_width/y], degree is X and Y for maxpool2d
+    // avgpool2d input [N, C, H/X, W/Y] output [N, C, output_height/x,
+    // output_width/y], degree is X and Y for maxpool2d
     for (int i = 2; i < input.num_dims(); i++) {
       output_shape.at(ff_dim_t(i)).is_replica_dim = true;
       output_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index 292d90d2e2..b95a400ad5 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -4,12 +4,14 @@
 
 namespace FlexFlow {
 
-// this may be wrong partition by n multiplies degree by n and keeps shape the  same
+// this may be wrong partition by n multiplies degree by n and keeps shape the
+// same
 ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   ParallelDim dim = input.at(attrs.repartition_dim);
-  if(dim.size % attrs.repartition_degree * dim.degree != 0) {
-    throw mk_runtime_error("RepartitionAttrs: input.at(attrs.repartition_dim) % attrs.repartition_degree * dim.degree != 0");
+  if (dim.size % attrs.repartition_degree * dim.degree != 0) {
+    throw mk_runtime_error("RepartitionAttrs: input.at(attrs.repartition_dim) "
+                           "% attrs.repartition_degree * dim.degree != 0");
   }
   ParallelTensorShape output(input.dims, input.data_type);
   output.at(attrs.repartition_dim).degree *= attrs.repartition_degree;
diff --git a/lib/op-attrs/src/reshape.cc b/lib/op-attrs/src/reshape.cc
index c715c8dada..b7e887002a 100644
--- a/lib/op-attrs/src/reshape.cc
+++ b/lib/op-attrs/src/reshape.cc
@@ -4,10 +4,11 @@
 
 namespace FlexFlow {
 
-//https://pytorch.org/docs/stable/generated/torch.reshape.html
-// pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and the
-// output is [4, 6] currently we doesn't consider the case of -1,we can support
-// this later the input:[2,3,4], attrs.shape:[4,6], the output is [4, 6]
+// https://pytorch.org/docs/stable/generated/torch.reshape.html
+//  pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and
+//  the output is [4, 6] currently we doesn't consider the case of -1,we can
+//  support this later the input:[2,3,4], attrs.shape:[4,6], the output is [4,
+//  6]
 ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   std::size_t input_volume = input.dims.get_volume();
@@ -15,30 +16,31 @@ ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
   for (int i = 0; i < attrs.shape.dims.num_dims(); i++) {
     attrs_volume *= attrs.shape.at(ff_dim_t(i));
   }
-  if(input_volume != attrs_volume) {
+  if (input_volume != attrs_volume) {
     throw mk_runtime_error("ReshapeAttrs: input_volume != attrs_volume");
   }
 
   ParallelTensorShape output = input;
   output.data_type = input.data_type;
-  if(attrs.shape.dims.num_dims() == 1) {
-      //infer the shape
-      if(attrs.shape.at(ff_dim_t(0)) == -1) {
-       
-        output.at(ff_dim_t(0)).size = input_volume ;
-        output.at(ff_dim_t(0)).degree = 1;
-        output.at(ff_dim_t(0)).is_replica_dim = false;
-      } else {
-        output.at(ff_dim_t(0)).size = attrs.shape.at(ff_dim_t(0));
-        output.at(ff_dim_t(1)).size = input_volume / attrs.shape.at(ff_dim_t(0));
-        for(int i = 0; i < 2; i++) {
-          output.at(ff_dim_t(i)).degree = 1;
-          output.at(ff_dim_t(i)).is_replica_dim = false;
-        }
+  if (attrs.shape.dims.num_dims() == 1) {
+    // infer the shape
+    if (attrs.shape.at(ff_dim_t(0)) == -1) {
+
+      output.at(ff_dim_t(0)).size = input_volume;
+      output.at(ff_dim_t(0)).degree = 1;
+      output.at(ff_dim_t(0)).is_replica_dim = false;
+    } else {
+      output.at(ff_dim_t(0)).size = attrs.shape.at(ff_dim_t(0));
+      output.at(ff_dim_t(1)).size = input_volume / attrs.shape.at(ff_dim_t(0));
+      for (int i = 0; i < 2; i++) {
+        output.at(ff_dim_t(i)).degree = 1;
+        output.at(ff_dim_t(i)).is_replica_dim = false;
       }
+    }
   } else {
-      ParallelTensorDims dims{attrs.shape.dims};
-      output = {dims, input.data_type};
+    ParallelTensorDims dims{attrs.shape.dims};
+    output = {dims, input.data_type};
+    // Note: I think reshape doesn't need to consider the degree
   }
   return output;
 }
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
index a09d43ae61..644e733a0a 100644
--- a/lib/op-attrs/src/reverse.cc
+++ b/lib/op-attrs/src/reverse.cc
@@ -1,21 +1,16 @@
 #include "op-attrs/ops/reverse.h"
 #include "op-attrs/ff_dim.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool ReverseAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (input.is_valid() == false) {
-    return false;
-  }
-  if (this->axis < 0 || this->axis >= input.num_dims()) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(ReverseAttrs const &attrs,
                                      ParallelTensorShape const &input) {
+  if (attrs.axis < 0 || attrs.axis >= input.num_dims()) {
+    throw mk_runtime_error("ReverseAttrs: axis is invalid");
+  }
   ParallelTensorShape output = input;
+  // output degree is same as input degree, because it's just reverse operation
   return output;
 }
 
diff --git a/lib/op-attrs/src/softmax.cc b/lib/op-attrs/src/softmax.cc
index 91d6555681..1d9cd5fcc0 100644
--- a/lib/op-attrs/src/softmax.cc
+++ b/lib/op-attrs/src/softmax.cc
@@ -1,20 +1,13 @@
 #include "op-attrs/ops/softmax.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool SoftmaxAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  if (input.num_dims() < 2) {
-    return false;
-  }
-  return true;
-}
-
 ParallelTensorShape get_output_shape(SoftmaxAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  assert(attrs.is_valid(input));
+  if (input.num_dims() < 2) {
+    throw mk_runtime_error("SoftmaxAttrs: input.num_dims() < 2");
+  }
   ParallelTensorShape output = input;
   return output;
 }
diff --git a/lib/op-attrs/src/split.cc b/lib/op-attrs/src/split.cc
index 1c14f1c370..5c6f3d6924 100644
--- a/lib/op-attrs/src/split.cc
+++ b/lib/op-attrs/src/split.cc
@@ -1,33 +1,28 @@
 #include "op-attrs/ops/split.h"
 #include "op-attrs/ff_dim.h"
+#include "utils/containers.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool SplitAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  std::size_t dims_sum = 0;
-
-  for (std::size_t i = 0; i < this->splits.size(); ++i) {
-    dims_sum += splits[i];
-  }
-
-  if (dims_sum != input.at(ff_dim_t(axis)).size) {
-    return false;
-  }
-  return true;
-}
-
 std::vector<ParallelTensorShape>
     get_output_shapes(SplitAttrs const &attrs,
                       ParallelTensorShape const &input) {
 
-  assert(attrs.is_valid(input));
+  std::size_t dims_sum = sum(attrs.splits);
+  if (dims_sum != input.at(ff_dim_t(attrs.axis)).size) {
+    throw mk_runtime_error(
+        "SplitAttrs: dims_sum != input.at(ff_dim_t(attrs.axis)).size");
+  }
+
   std::vector<ParallelTensorShape> outputs;
   for (std::size_t i = 0; i < attrs.splits.size(); ++i) {
     outputs.emplace_back(input);
     outputs.back().at(ff_dim_t(attrs.axis)).size = attrs.splits[i];
+    outputs.back().at(ff_dim_t(attrs.axis)).degree =
+        input.at(ff_dim_t(attrs.axis)).degree;
+    outputs.back().at(ff_dim_t(attrs.axis)).is_replica_dim =
+        input.at(ff_dim_t(attrs.axis)).degree > 1;
   }
   return outputs;
 }
diff --git a/lib/op-attrs/src/topk.cc b/lib/op-attrs/src/topk.cc
index 06c43b3eba..73bf59b048 100644
--- a/lib/op-attrs/src/topk.cc
+++ b/lib/op-attrs/src/topk.cc
@@ -1,23 +1,22 @@
 #include "op-attrs/ops/topk.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool TopKAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
+ParallelTensorShape get_output_shape(TopKAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
 
-  if (k > input.at(ff_dim_t(axis)).size) {
-    return false;
+  if (attrs.k > input.at(ff_dim_t(attrs.axis)).size) {
+    throw mk_runtime_error(
+        "TopKAttrs: k > input.at(ff_dim_t(attrs.axis)).size");
   }
-  return true;
-}
 
-ParallelTensorShape get_output_shape(TopKAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
-  assert(attrs.is_valid(input));
   ParallelTensorShape output = input;
   output.at(ff_dim_t(attrs.axis)).size = attrs.k;
+  output.at(ff_dim_t(attrs.axis)).degree =
+      input.at(ff_dim_t(attrs.axis)).degree;
+  output.at(ff_dim_t(attrs.axis)).is_replica_dim =
+      input.at(ff_dim_t(attrs.axis)).degree > 1;
   return output;
 }
 
diff --git a/lib/op-attrs/src/transpose.cc b/lib/op-attrs/src/transpose.cc
index 97140c6b49..88772b72e0 100644
--- a/lib/op-attrs/src/transpose.cc
+++ b/lib/op-attrs/src/transpose.cc
@@ -1,38 +1,35 @@
 #include "op-attrs/ops/transpose.h"
 #include "op-attrs/ff_dim.h"
-#include "utils/exception.decl.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool TransposeAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  // in pytorch, we use choose two dim for transpose, so I think the size of
-  // perm should be 2
-  if (perm.size() != 2) {
-    return false;
+// assume we have [x, y, z, l], perms is [0,2] we return [z, y, x, l]
+ParallelTensorShape get_output_shape(TransposeAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  if (attrs.perm.size() != 2) {
+    throw mk_runtime_error("TransposeAttrs: perm.size() != 2");
   }
 
-  auto dim0 = perm[0];
-  auto dim1 = perm[1];
+  auto dim0 = attrs.perm[0];
+  auto dim1 = attrs.perm[1];
   if (dim0 < 0 || dim1 < 0 || dim0 >= input.num_dims() ||
       dim1 >= input.num_dims()) {
-    return false;
+    throw mk_runtime_error("TransposeAttrs: dim0 < 0 || dim1 < 0 || dim0 >= "
+                           "input.num_dims() || dim1 >= input.num_dims()");
   }
 
-  return true;
-}
-
-// assume we have [x, y, z, l], perms is [0,2] we return [z, y, x, l]
-ParallelTensorShape get_output_shape(TransposeAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
-  auto dim0 = attrs.perm[0];
-  auto dim1 = attrs.perm[1];
   int temp = input.at(ff_dim_t(dim0)).size;
+  int degree = input.at(ff_dim_t(dim0)).degree;
   output.at(ff_dim_t(dim0)).size = input.at(ff_dim_t(dim1)).size;
   output.at(ff_dim_t(dim1)).size = temp;
+  output.at(ff_dim_t(dim0)).degree = input.at(ff_dim_t(dim1)).degree;
+  output.at(ff_dim_t(dim1)).degree = degree;
+  output.at(ff_dim_t(dim0)).is_replica_dim =
+      output.at(ff_dim_t(dim0)).degree > 1;
+  output.at(ff_dim_t(dim1)).is_replica_dim =
+      output.at(ff_dim_t(dim1)).degree > 1;
   return output;
 }
 

From 50ec41f0b204dd5fc4b2afc91c6ec5bce8edb88f Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 18 Oct 2023 15:46:12 +0000
Subject: [PATCH 52/69] try to implement the attention

---
 lib/op-attrs/src/attention.cc | 78 ++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 24 deletions(-)

diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index 2d189d7472..bee1606d59 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,8 +1,6 @@
 #include "op-attrs/ops/attention.h"
-#include "kernels/legion_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.decl.h"
-#include "utils/exceptions.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -88,37 +86,69 @@ TensorShape
 
 // according to the pytorch
 // https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
-// query: [target_size_seq_len, batch_size, embed_dim], we consider the batch
-// size key: (seq_len, batch_size, embed_dim) value: (seq_len, batch_size,
-// embed_dim)
+// we consider the batch size
+// query: [seq_len, batch_size, embed_dim],
+// key: (seq_len, batch_size, embed_dim)
+// value: (seq_len, batch_size,embed_dim)
 //  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-// output: (target_size_seq_len, batch_size, embed_dim)
+// output: (seq_len, batch_size, embed_dim)
 
 ParallelTensorShape get_output_shape(
     MultiHeadAttentionAttrs const &attrs,
-    MultiHeadAttentionInputs<ParallelTensorShape> const &inputs) {
-  ParallelTensorShape output_shape = inputs.query;
-  NOT_IMPLEMENTED();
-}
-
-bool is_valid(MultiHeadAttentionAttrs const &attrs,
-              MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
-  bool valid = true;
+    MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
   if (input.query.num_dims() != 3 || input.key.num_dims() != 3 ||
       input.value.num_dims() != 3) {
-    return false;
+    throw mk_runtime_error("MultiHeadAttentionAttrs: num_dims != 3");
+  }
+
+  if (input.query.at(ff_dim_t(0)).size != input.key.at(ff_dim_t(0)).size ||
+      input.query.at(ff_dim_t(0)).size != input.value.at(ff_dim_t(0)).size ||
+      input.key.at(ff_dim_t(0)).size != input.value.at(ff_dim_t(0)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs: seq_len not match");
+  }
+
+  if (input.query.at(ff_dim_t(1)).size != input.key.at(ff_dim_t(1)).size ||
+      input.query.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size ||
+      input.key.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs: batch_size not match");
   }
-  // ff_dim_t = num_dims - legion_dim_t - 1
-  if (input.query.at(legion_dim_t(0)).size != attrs.embed_dim) {
-    return false;
+
+  if (input.query.at(ff_dim_t(2)).size != input.key.at(ff_dim_t(2)).size ||
+      input.query.at(ff_dim_t(2)).size != input.value.at(ff_dim_t(2)).size ||
+      input.key.at(ff_dim_t(2)).size != input.value.at(ff_dim_t(2)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs:  embed_dim not match");
+  }
+
+  if (input.query.at(ff_dim_t(2)).size != attrs.embed_dim ||
+      input.key.at(ff_dim_t(2)).size != attrs.embed_dim ||
+      input.value.at(ff_dim_t(2)).size != attrs.embed_dim) {
+    throw mk_runtime_error(
+        "MultiHeadAttentionAttrs:  input's embed_dim not match to attrs");
   }
-  if (input.key.at(legion_dim_t(0)).size != attrs.embed_dim) {
-    return false;
+
+  if (attrs.embed_dim != (attrs.num_heads * attrs.kdim)) {
+    throw mk_runtime_error(
+        "MultiHeadAttentionAttrs:  embed_dim not match to num_heads * kdim");
   }
-  if (input.value.at(legion_dim_t(0)).size != attrs.embed_dim) {
-    return false;
+
+  // TODO: how to deal with the degree
+  // q = wq*x , k = wk*x, v = wv*x  (seq_len, batch_size, embed_dim)
+  // k->(seq_len, num_head, batch_size, kdim)
+  // v->(seq_len, num_head, batch_size, vdim)
+  // q->(seq_len, num_head, batch_size, kdim)
+  // attn = q @k  (seq_len, num_head, batch_size, batch_size)
+  // attn = attn @v (seq_len, num_head, batch_size, vdim)
+  // attn = attn.transpose(1,2) (seq_len, batch_size, num_head, vdim)
+  // attn = attn.reshape(seq_len, batch_size, num_head*vdim)
+
+  // Note: we support tensor parallelism for seq_len/batch_size/embed_dim
+  ParallelTensorShape output = input.query;
+  for (int i = 0; i < output.num_dims(); i++) {
+    output.at(ff_dim_t(i)).degree = input.query.at(ff_dim_t(i)).degree;
+    output.at(ff_dim_t(i)).is_replica_dim =
+        input.query.at(ff_dim_t(i)).degree > 1;
   }
-  return true;
+  return output;
 }
 
 } // namespace FlexFlow

From 2e2533adcf282b62daba19515299c7356048ef9b Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 18 Oct 2023 15:57:37 +0000
Subject: [PATCH 53/69] leave reduce

---
 lib/op-attrs/src/reduce.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 3deb33e680..727e41e0ec 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -3,12 +3,11 @@
 
 namespace FlexFlow {
 
-bool ReduceAttrs::is_valid(ParallelTensorShape const &input) const {
-    NOT_IMPLEMENTED()}
-
 ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   NOT_IMPLEMENTED()
+  // reduce is sum/max/min/mean
+  // NOTE: how to implement this
 }
 
 } // namespace FlexFlow

From aff7b006300b658db3b90b4cd9010baf55992a0e Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 18 Oct 2023 21:09:38 +0000
Subject: [PATCH 54/69] fix some errors

---
 lib/op-attrs/include/op-attrs/ops/attention.h |  3 --
 .../include/op-attrs/ops/element_binary.h     |  1 -
 lib/op-attrs/include/op-attrs/ops/replicate.h |  1 -
 lib/op-attrs/src/attention.cc                 | 29 -------------------
 lib/op-attrs/src/batch_matmul.cc              |  1 -
 lib/op-attrs/src/batch_norm.cc                |  7 ++---
 lib/op-attrs/src/cast.cc                      |  8 ++---
 lib/op-attrs/src/combine.cc                   | 14 +++++----
 lib/op-attrs/src/concat.cc                    |  1 -
 lib/op-attrs/src/dropout.cc                   |  6 ++--
 lib/op-attrs/src/element_binary.cc            |  7 +++--
 lib/op-attrs/src/element_unary.cc             |  6 ++--
 lib/op-attrs/src/embedding.cc                 |  1 -
 lib/op-attrs/src/groupby.cc                   |  7 ++---
 lib/op-attrs/src/layer_norm.cc                |  6 ++--
 lib/op-attrs/src/reduce.cc                    |  2 +-
 lib/op-attrs/src/replicate.cc                 | 18 ++++--------
 lib/op-attrs/src/reverse.cc                   |  8 ++---
 lib/op-attrs/src/softmax.cc                   | 10 +++----
 19 files changed, 47 insertions(+), 89 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h
index 7d332ddc0e..a80c579689 100644
--- a/lib/op-attrs/include/op-attrs/ops/attention.h
+++ b/lib/op-attrs/include/op-attrs/ops/attention.h
@@ -44,9 +44,6 @@ struct MultiHeadAttentionInputs
   TensorType value;
 };
 
-bool is_valid(MultiHeadAttentionAttrs const &,
-              MultiHeadAttentionInputs<ParallelTensorShape> const &input);
-
 int get_qProjSize(MultiHeadAttentionAttrs const &);
 int get_vProjSize(MultiHeadAttentionAttrs const &);
 int get_kProjSize(MultiHeadAttentionAttrs const &);
diff --git a/lib/op-attrs/include/op-attrs/ops/element_binary.h b/lib/op-attrs/include/op-attrs/ops/element_binary.h
index 9a2e4dc22a..377a03970a 100644
--- a/lib/op-attrs/include/op-attrs/ops/element_binary.h
+++ b/lib/op-attrs/include/op-attrs/ops/element_binary.h
@@ -4,7 +4,6 @@
 #include "core.h"
 #include "op-attrs/op.h"
 #include "op-attrs/parallel_tensor_shape.h"
-#include "op-attrs/©"
 #include "utils/visitable.h"
 
 namespace FlexFlow {
diff --git a/lib/op-attrs/include/op-attrs/ops/replicate.h b/lib/op-attrs/include/op-attrs/ops/replicate.h
index c2a9b6abf0..2bbcad9d95 100644
--- a/lib/op-attrs/include/op-attrs/ops/replicate.h
+++ b/lib/op-attrs/include/op-attrs/ops/replicate.h
@@ -11,7 +11,6 @@ namespace FlexFlow {
 struct ReplicateAttrs {
   ff_dim_t replicate_dim;
   req<int> replicate_degree;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(ReplicateAttrs, replicate_dim, replicate_degree);
 CHECK_VALID_OP_ATTR(ReplicateAttrs);
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index bee1606d59..61a5c79b0a 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -54,35 +54,6 @@ TensorShape
 
   return {dims, DataType::FLOAT};
 }
-// these two functions are not defined in the attention.h
-//  ParallelTensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
-//                                       ParallelTensorShape const &query_shape,
-//                                       ParallelTensorShape const &key_shape,
-//                                       ParallelTensorShape const &value_shape)
-//                                       {
-//    /* ParallelDim replica_dim =
-//    query_shape.at(ff_dim_t(query_shape.num_dims() -
-//     * 2)); */
-//    /* replica_dim.size = replica_dim.degree; */
-
-//   /* ParallelDim */
-
-//   ParallelTensorShape output_shape = query_shape;
-//   output_shape.at(ff_dim_t(output_shape.num_dims() - 1)).size =
-//   attrs.embed_dim; return output_shape;
-// }
-
-// TensorShape get_output_shape(MultiHeadAttentionAttrs const &attrs,
-//                              TensorShape const &query_shape,
-//                              TensorShape const &key_shape,
-//                              TensorShape const &value_shape) {
-//   ParallelTensorShape parallel_shape =
-//       get_output_shape(attrs,
-//                        static_cast<ParallelTensorShape>(query_shape),
-//                        static_cast<ParallelTensorShape>(key_shape),
-//                        static_cast<ParallelTensorShape>(value_shape));
-//   return get_tensor_shape_unsafe(parallel_shape);
-// }
 
 // according to the pytorch
 // https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index deabdcea5b..e170e29053 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -4,7 +4,6 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-
 // how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
 // output: [b, n, p] //n == s1, m == s2
 //[b, n/2, m], [b, m, p/2] -> [b, n/2, p/2]
diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 2f050814c9..2557c9def5 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -2,18 +2,17 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-
 // input: [b, c, h, w]
 // output: [b, c, h, w]
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
+                                     ParallelTensorShape const &input_shape) {
   if (!input.is_valid() || input.num_dims() != 4) {
     throw mk_runtime_error(
         "BatchNormAttrs::get_output_shape: input is invalid");
   }
-  ParallelTensorShape output_shape = input;
+
   // the degree of the output is the same as the input
-  return output_shape;
+  return input_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index 7c679439ad..868756abcd 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -4,13 +4,13 @@
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(CastAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
+                                     ParallelTensorShape const &input_shape) {
   if (!input.is_valid()) {
     throw mk_runtime_error("CastAttrs::get_output_shape: input is invalid");
   }
-  ParallelTensorShape output = input;
-  output.data_type = attrs.dtype;
-  return output;
+  ParallelTensorShape output_shape = input_shape;
+  output_shape.data_type = attrs.dtype;
+  return output_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index 48fc6c8720..d851c186c8 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -2,12 +2,14 @@
 #include "utils/hash-utils.h"
 
 namespace FlexFlow {
-
-ParallelTensorShape get_output_shape(CombineAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
-  ParallelTensorShape output = input;
-  output.at(attrs.combine_dim).degree /= attrs.combine_degree;
-  return output;
+// combine_dim's degree is divided by combine_degree; replica flag follows it
+ParallelTensorShape get_output_shape(CombineAttrs const &attrs,
+                                     ParallelTensorShape const &input_shape) {
+  ParallelTensorShape output_shape = input_shape;
+  output_shape.at(attrs.combine_dim).degree /= attrs.combine_degree;
+  output_shape.at(attrs.combine_dim).is_replica_dim =
+      output_shape.at(attrs.combine_dim).degree > 1;
+  return output_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 5c9dc3e370..47807b9c1a 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -2,7 +2,6 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-
 ParallelTensorShape
     get_output_shape(ConcatAttrs const &attrs,
                      std::vector<ParallelTensorShape> const &inputs) {
diff --git a/lib/op-attrs/src/dropout.cc b/lib/op-attrs/src/dropout.cc
index ab763b8a7f..5d5f78ef25 100644
--- a/lib/op-attrs/src/dropout.cc
+++ b/lib/op-attrs/src/dropout.cc
@@ -4,9 +4,9 @@
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(DropoutAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
-  ParallelTensorShape output = input;
-  return output;
+                                     ParallelTensorShape const &input_shape) {
+
+  return input_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index c61be195c0..77ddc016b8 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/element_binary.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
@@ -18,8 +19,10 @@ ParallelTensorShape get_output_shape(ElementBinaryAttrs const &atts,
     } else if (rhs.at(ff_dim_t(i)).size == 1) {
       output.at(ff_dim_t(i)) = lhs.at(ff_dim_t(i));
     } else {
-      assert(false && "Operands could not be broadcast together");
-      exit(0);
+      throw mk_runtime_error(
+          "Operands of shapes {} and {} could not be broadcast together",
+          lhs,
+          rhs);
     }
   }
 
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index b9028ac3b8..109653c5c5 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -3,9 +3,9 @@
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(ElementUnaryAttrs const &atts,
-                                     ParallelTensorShape const &input) {
-  ParallelTensorShape output = input;
-  return output;
+                                     ParallelTensorShape const &input_shape) {
+
+  return input_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index 5e86335f14..3aba747036 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -9,7 +9,6 @@ namespace FlexFlow {
 ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
                                      ParallelTensorShape const &input) {
   ParallelTensorShape output = input;
-  output.at(ff_dim_t(1)).size = input.at(ff_dim_t(1)).size;
   output.at(ff_dim_t(2)).size = atts.out_channels;
   // output degree is same as input degree
   return output;
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index 09babdb20d..6c1b201993 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -19,16 +19,15 @@ in enumerate(grouped_data): print(f"Group {i}: {group}")
 */
 
 ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
-                                     ParallelTensorShape const &input,
+                                     ParallelTensorShape const &input_shape,
                                      ParallelTensorShape const &index) {
-  if (input.num_dims() != index.num_dims()) {
+  if (input_shape.num_dims() != index.num_dims()) {
     throw mk_runtime_error(
         "Group_by: input and index must have the same number of dimensions");
   }
 
-  ParallelTensorShape output = input;
   // degree of output is same as input's
-  return output;
+  return input_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index 58160b528f..737e527647 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -5,13 +5,13 @@ namespace FlexFlow {
 
 // todo: maybe we need to set the degree of parallel_dim
 ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
+                                     ParallelTensorShape const &input_shape) {
   if (input.num_dims() < 2) {
     throw mk_runtime_error("LayerNorm: input must have at least 2 dimensions");
   }
-  ParallelTensorShape output = input;
+
   // output degree is same as input degree
-  return output;
+  return input_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 727e41e0ec..a6c78db75b 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -6,7 +6,7 @@ namespace FlexFlow {
 ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   NOT_IMPLEMENTED()
-  // reduce is sum/max/min/mean
+  // reduce is sum/max/min/mean, I think we just return 1D tensor
   // NOTE: how to implement this
 }
 
diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index 2086ab41bd..d059bd387b 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -1,20 +1,9 @@
 #include "op-attrs/ops/replicate.h"
 #include "op-attrs/parallel_dim.h"
-#include "utils/exception.decl.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool ReplicateAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  if (this->replicate_dim >= input.num_dims() || this->replicate_degree <= 0) {
-    return false;
-  }
-
-  return true;
-}
-
 // replicate by n multiplies degree by n and shape by n
 // seems it is like pytorch's repeat
 // original_tensor = torch.tensor([1, 2, 3]) torch.Size([3])
@@ -25,7 +14,10 @@ bool ReplicateAttrs::is_valid(ParallelTensorShape const &input) const {
 
 ParallelTensorShape get_output_shape(ReplicateAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  assert(attrs.is_valid(input));
+  if (attrs.replicate_dim >= input.num_dims() || attrs.replicate_degree <= 0) {
+    throw mk_runtime_error("ReplicateAttrs::get_output_shape: axis is out of "
+                           "range or input is invalid");
+  }
   ParallelTensorShape output = input;
   output.at(attrs.replicate_dim).size *= attrs.replicate_degree;
   return output;
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
index 644e733a0a..f418495acf 100644
--- a/lib/op-attrs/src/reverse.cc
+++ b/lib/op-attrs/src/reverse.cc
@@ -5,13 +5,13 @@
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(ReverseAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
-  if (attrs.axis < 0 || attrs.axis >= input.num_dims()) {
+                                     ParallelTensorShape const &input_shape) {
+  if (attrs.axis < 0 || attrs.axis >= input_shape.num_dims()) {
     throw mk_runtime_error("ReverseAttrs: axis is invalid");
   }
-  ParallelTensorShape output = input;
+
   // output degree is same as input degree, because it's just reverse operation
-  return output;
+  return input_shape;
 }
 
 }; // namespace FlexFlow
diff --git a/lib/op-attrs/src/softmax.cc b/lib/op-attrs/src/softmax.cc
index 1d9cd5fcc0..d9ab0c9a84 100644
--- a/lib/op-attrs/src/softmax.cc
+++ b/lib/op-attrs/src/softmax.cc
@@ -4,12 +4,12 @@
 namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(SoftmaxAttrs const &attrs,
-                                     ParallelTensorShape const &input) {
-  if (input.num_dims() < 2) {
-    throw mk_runtime_error("SoftmaxAttrs: input.num_dims() < 2");
+                                     ParallelTensorShape const &input_shape) {
+  if (input_shape.num_dims() < 2) {
+    throw mk_runtime_error("SoftmaxAttrs: input_shape.num_dims() < 2");
   }
-  ParallelTensorShape output = input;
-  return output;
+
+  return input_shape;
 }
 
 } // namespace FlexFlow

From e59975eb2ad1bd253d9817c8100c126d531a6cb4 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 18 Oct 2023 21:17:23 +0000
Subject: [PATCH 55/69] remove empty blank lines

---
 lib/op-attrs/src/batch_norm.cc    | 8 ++++----
 lib/op-attrs/src/broadcast.cc     | 5 ++++-
 lib/op-attrs/src/cast.cc          | 5 +++--
 lib/op-attrs/src/dropout.cc       | 1 -
 lib/op-attrs/src/element_unary.cc | 1 -
 lib/op-attrs/src/groupby.cc       | 1 -
 lib/op-attrs/src/layer_norm.cc    | 1 -
 lib/op-attrs/src/pool_2d.cc       | 2 --
 lib/op-attrs/src/repartition.cc   | 2 +-
 lib/op-attrs/src/replicate.cc     | 2 ++
 lib/op-attrs/src/reverse.cc       | 1 -
 lib/op-attrs/src/softmax.cc       | 1 -
 12 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/lib/op-attrs/src/batch_norm.cc b/lib/op-attrs/src/batch_norm.cc
index 2557c9def5..5e22c8147d 100644
--- a/lib/op-attrs/src/batch_norm.cc
+++ b/lib/op-attrs/src/batch_norm.cc
@@ -2,16 +2,16 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-// input: [b, c, h, w]
+// input_shape: [b, c, h, w]
 // output: [b, c, h, w]
 ParallelTensorShape get_output_shape(BatchNormAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
-  if (!input.is_valid() || input.num_dims() != 4) {
+  if (!input_shape.is_valid() || input_shape.num_dims() != 4) {
     throw mk_runtime_error(
-        "BatchNormAttrs::get_output_shape: input is invalid");
+        "BatchNormAttrs::get_output_shape: input_shape is invalid");
   }
 
-  // the degree of the output is the same as the input
+  // the degree of the output is the same as the input_shape
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/broadcast.cc b/lib/op-attrs/src/broadcast.cc
index c69f480b84..f0de4cc807 100644
--- a/lib/op-attrs/src/broadcast.cc
+++ b/lib/op-attrs/src/broadcast.cc
@@ -1,3 +1,6 @@
 #include "op-attrs/ops/broadcast.h"
 
-namespace FlexFlow {} // namespace FlexFlow
+namespace FlexFlow {
+
+// what's the definition of broadcast for get_output_shape
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/cast.cc b/lib/op-attrs/src/cast.cc
index 868756abcd..60f899fed2 100644
--- a/lib/op-attrs/src/cast.cc
+++ b/lib/op-attrs/src/cast.cc
@@ -5,8 +5,9 @@ namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(CastAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
-  if (!input.is_valid()) {
-    throw mk_runtime_error("CastAttrs::get_output_shape: input is invalid");
+  if (!input_shape.is_valid()) {
+    throw mk_runtime_error(
+        "CastAttrs::get_output_shape: input_shape is invalid");
   }
   ParallelTensorShape output_shape = input_shape;
   output_shape.data_type = attrs.dtype;
diff --git a/lib/op-attrs/src/dropout.cc b/lib/op-attrs/src/dropout.cc
index 5d5f78ef25..7bdae67af9 100644
--- a/lib/op-attrs/src/dropout.cc
+++ b/lib/op-attrs/src/dropout.cc
@@ -5,7 +5,6 @@ namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(DropoutAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
-
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/element_unary.cc b/lib/op-attrs/src/element_unary.cc
index 109653c5c5..08622e6f63 100644
--- a/lib/op-attrs/src/element_unary.cc
+++ b/lib/op-attrs/src/element_unary.cc
@@ -4,7 +4,6 @@ namespace FlexFlow {
 
 ParallelTensorShape get_output_shape(ElementUnaryAttrs const &atts,
                                      ParallelTensorShape const &input_shape) {
-
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index 6c1b201993..64d7797771 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -25,7 +25,6 @@ ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
     throw mk_runtime_error(
         "Group_by: input and index must have the same number of dimensions");
   }
-
   // degree of output is same as input's
   return input_shape;
 }
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index 737e527647..d98562219f 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -9,7 +9,6 @@ ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
-  if (input.num_dims() < 2) {
+  if (input_shape.num_dims() < 2) {
     throw mk_runtime_error("LayerNorm: input must have at least 2 dimensions");
   }
-
   // output degree is same as input degree
   return input_shape;
 }
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index b3859c6e06..ec6253a0b1 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -122,8 +122,6 @@ ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
   return output_shape;
 }
 
-}
-
 /* ParallelTensorShape Pool2DAttrs::calculate_output_shape(ParallelTensorShape
  * const &input) const { */
 /*   return solve_mappings(input).output_shapes.at(0); */
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index b95a400ad5..d7807fcc10 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -11,7 +11,7 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
   ParallelDim dim = input.at(attrs.repartition_dim);
   if (dim.size % attrs.repartition_degree * dim.degree != 0) {
     throw mk_runtime_error("RepartitionAttrs: input.at(attrs.repartition_dim) "
-                           "% attrs.repartition_degree * dim.degree != 0");
+                           "attrs.repartition_degree * dim.degree != 0");
   }
   ParallelTensorShape output(input.dims, input.data_type);
   output.at(attrs.repartition_dim).degree *= attrs.repartition_degree;
diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index d059bd387b..7a1b511a5e 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -20,6 +20,8 @@ ParallelTensorShape get_output_shape(ReplicateAttrs const &attrs,
   }
   ParallelTensorShape output = input;
   output.at(attrs.replicate_dim).size *= attrs.replicate_degree;
+  output.at(attrs.replicate_dim).is_replica_dim =
+      (input.at(attrs.replicate_dim).degree > 1);
   return output;
 }
 
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
index f418495acf..663150861f 100644
--- a/lib/op-attrs/src/reverse.cc
+++ b/lib/op-attrs/src/reverse.cc
@@ -9,7 +9,6 @@ ParallelTensorShape get_output_shape(ReverseAttrs const &attrs,
   if (attrs.axis < 0 || attrs.axis >= input_shape.num_dims()) {
     throw mk_runtime_error("ReverseAttrs: axis is invalid");
   }
-
   // output degree is same as input degree, because it's just reverse operation
   return input_shape;
 }
diff --git a/lib/op-attrs/src/softmax.cc b/lib/op-attrs/src/softmax.cc
index d9ab0c9a84..eff13aab59 100644
--- a/lib/op-attrs/src/softmax.cc
+++ b/lib/op-attrs/src/softmax.cc
@@ -8,7 +8,6 @@ ParallelTensorShape get_output_shape(SoftmaxAttrs const &attrs,
   if (input_shape.num_dims() < 2) {
     throw mk_runtime_error("SoftmaxAttrs: input_shape.num_dims() < 2");
   }
-
   return input_shape;
 }
 

From 7da6b505ebd88e06c1a9568dac9463f638794ea6 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Thu, 19 Oct 2023 19:25:57 +0000
Subject: [PATCH 56/69] some update

---
 lib/op-attrs/src/attention.cc    | 6 +++---
 lib/op-attrs/src/batch_matmul.cc | 2 +-
 lib/op-attrs/src/reduce.cc       | 3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index 61a5c79b0a..f89124a94f 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -58,9 +58,9 @@ TensorShape
 // according to the pytorch
 // https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
 // we consider the batch size
-// query: [seq_len, batch_size, embed_dim],
-// key: (seq_len, batch_size, embed_dim)
-// value: (seq_len, batch_size,embed_dim)
+// query: [replicate_num, seq_len, batch_size, embed_dim],4D, 
+// key: (replicate_num, seq_len, batch_size, embed_dim)
+// value: (replicate_num ,seq_len, batch_size,embed_dim)
 //  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
 // output: (seq_len, batch_size, embed_dim)
 
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index e170e29053..17406268cd 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -4,7 +4,7 @@
 #include "utils/exception.h"
 
 namespace FlexFlow {
-// how to get the batch size? and lhs: [b, n, m], rhs: [b, m, p]
+// how to get the batch size? and lhs: [replicate_num, b, n, m], rhs: [b, m, p]
 // output: [b, n, p] //n == s1, m == s2
 //[b, n/2, m], [b, m, p/2] -> [b, n/2, p/2]
 //[b, n, m/2], [b, m/2, p] -> [b, n, p/2]
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index a6c78db75b..930ec42d76 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -3,10 +3,11 @@
 
 namespace FlexFlow {
 
+//
 ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   NOT_IMPLEMENTED()
-  // reduce is sum/max/min/mean, I think we just return 1D tensor
+  // reduce is sum/max/min/mean, I think we just return 1D tensor [1, 2, 4] => [7, ]
   // NOTE: how to implement this
 }
 

From 40ffcd9b8b2b25a53d84b2b1d22b6999768a28ed Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 27 Oct 2023 13:44:54 +0000
Subject: [PATCH 57/69] Add new batch matmul get_output_shape and ParallelTensorShape::get_volume

---
 .../include/op-attrs/parallel_tensor_shape.h  |   2 +
 lib/op-attrs/src/batch_matmul.cc              | 102 +++++++++---------
 lib/op-attrs/src/parallel_tensor_shape.cc     |  10 ++
 3 files changed, 61 insertions(+), 53 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index ca980966e8..f30786f55d 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -27,6 +27,8 @@ struct ParallelTensorShape : public use_visitable_cmp<ParallelTensorShape> {
 
   int num_dims() const;
 
+  int get_volume() const; 
+  
   ParallelDim const &at(ff_dim_t const &) const;
   ParallelDim &at(ff_dim_t const &);
   ParallelDim const &operator[](ff_dim_t const &) const;
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 17406268cd..38f37bead2 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -1,69 +1,65 @@
 #include "op-attrs/ops/batch_matmul.h"
 #include "op-attrs/ff_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/exception.decl.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
-// how to get the batch size? and lhs: [replicate_num, b, n, m], rhs: [b, m, p]
-// output: [b, n, p] //n == s1, m == s2
-//[b, n/2, m], [b, m, p/2] -> [b, n/2, p/2]
-//[b, n, m/2], [b, m/2, p] -> [b, n, p/2]
+
+//lhs: [<r1, dl1, true>, <b, 1, f> ,<n, dl3, false>, <m, dl4, false>] 
+//rhs:[<r2, dr1, true>, <b,1,f> ,<m, dr3, false>, <p, dr4,false>]
+//in the original tensor, we assume the dl1/dr1 is 1
+//output:[<r3, do1, true>, <b,1,f>, <n, do3, false>, <p,do4, false>]
+//how to decide the r3, d01, do3, do4
+//Note: Lsize = r1 * dl3 * dl4, Rsize = r2 * dr3 * dr4 , Rsize = Lsize
+//do3 = dl3, do4 = dr4
+//so, r3 = Lsize / do3 / do4 
+//r3 / do1 = r1 / dl1 
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
-  ParallelTensorShape output_shape = lhs;
+if(lhs.num_dims() != 4 || rhs.num_dims() != 4) {
+  throw mk_runtime_error("rhs or lhs dimension is not 4");
+}
 
-  // check if the input is valid
-  if (!lhs.is_valid() || !rhs.is_valid()) {
-    throw mk_runtime_error(
-        "BatchMatmulAttrs::get_output_shape: input is invalid")
-  }
+int rl = lhs.at(ff_dim_t(0)).size ;// replicate_num of lhs
+int dl1 = lhs.at(ff_dim_t(0)).degree;//degree of 0 dimension
+int dl3 = lhs.at(ff_dim_t(3)).degree;//degree of third dimension
+int dr4 = rhs.at(ff_dim_t(4)).degree;//degree of fouth dimenstion
 
-  if (lhs.at(ff_dim_t(0)).size != rhs.at(ff_dim_t(0)).size) {
-    throw mk_runtime_error(
-        "BatchMatmulAttrs::get_output_shape: batch size is not equal");
-  }
-  if (lhs.at(ff_dim_t(2)).size != rhs.at(ff_dim_t(1)).size ||
-      lhs.at(ff_dim_t(1)).size != attrs.a_seq_length_dim ||
-      rhs.at(ff_dim_t(2)).size != attrs.b_seq_length_dim) {
+int lsize = lhs.get_volume();
+int rsize = rhs.get_volume();
+if(lsize != rsize) {
+  throw mk_runtime_error("BatchMatmulAttrs::get_output_shape, the volume of lhs and rhs are not matched ");
+}
+
+if (lhs.at(ff_dim_t(1)).size != rhs.at(ff_dim_t(1)).size) {
     throw mk_runtime_error(
-        "BatchMatmulAttrs::get_output_shape: third demension of lhs and second "
-        "dementions of rhs are not match");
-  }
-  output_shape.at(ff_dim_t(0)).size = lhs.at(ff_dim_t(0)).size; // batch size
-  output_shape.at(ff_dim_t(1)).size = lhs.at(ff_dim_t(1)).size;
-  output_shape.at(ff_dim_t(2)).size = rhs.at(ff_dim_t(2)).size;
+        "BatchMatmulAttrs::get_output_shape, batch size is not equal");
+}
+
+if(lhs.at(ff_dim_t(3)).size != rhs.at(ff_dim_t(3)).size ) {
+  throw mk_runtime_error(
+        "BatchMatmulAttrs::get_output_shape: forth demension of lhs and third dementions of rhs are not match");
+}
+
+//4D tensor
+ParallelTensorShape output_shape = lhs;
+
+output_shape.at(ff_dim_t(0)).size = lsize / (dl3 * dr4);
+output_shape.at(ff_dim_t(0)).degree = output_shape.at(ff_dim_t(0)).size / (rl / dl1);//this may have some problem
+output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+
+output_shape.at(ff_dim_t(3)).size = lhs.at(ff_dim_t(3)).size;
+output_shape.at(ff_dim_t(3)).degree = dl3;
+output_shape.at(ff_dim_t(3)).is_replica_dim = false;
+
+output_shape.at(ff_dim_t(4)).size = rhs.at(ff_dim_t(4)).size();
+output_shape.at(ff_dim_t(4)).degree = dr4;
+output_shape.at(ff_dim_t(4)).is_replica_dim = false;
+
+return output_shape;
 
-  if (lhs.at(ff_dim_t(1)).degree == 1 && lhs.at(ff_dim_t(2)).degree == 1) {
-    // case 0: degree is 1, [b, n, m], rhs: [b, m, p] -> [b, n, p]
-    for (int i = 1; i < lhs.num_dims(); i++) {
-      output_shape.at(ff_dim_t(i)).degree = 1;
-      output_shape.at(ff_dim_t(i)).is_replica_dim = false;
-    }
-  } else if (lhs.at(ff_dim_t(1)).degree == 1 &&
-             lhs.at(ff_dim_t(2)).degree >
-                 1) { // case 1: [b, n, m/x], [b, m/x, p] => [b, n, y]
-    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
-  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
-             lhs.at(ff_dim_t(2)).degree ==
-                 1) { // case 2: [b, n/x, m] [b m p/x] => [b n/x p/x]
-    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
-  } else if (lhs.at(ff_dim_t(1)).degree > 1 &&
-             lhs.at(ff_dim_t(2)).degree >
-                 1) { // case 3: [b n/x m/y] [b m/y p/x]=> [b n/x p/x]
-    output_shape.at(ff_dim_t(1)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(1)).degree = lhs.at(ff_dim_t(1)).degree;
-    output_shape.at(ff_dim_t(2)).degree = rhs.at(ff_dim_t(2)).degree;
-  } else {
-    throw mk_runtime_error("BatchMatmulAttrs::get_output_shape: not supported "
-                           "in BatchMatmulAttrs get_output_shape");
-  }
-  return output_shape;
 }
 
 /* bool BatchMatmulAttrs::is_valid( */
diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc
index 9a36e7d11b..8f9514e58a 100644
--- a/lib/op-attrs/src/parallel_tensor_shape.cc
+++ b/lib/op-attrs/src/parallel_tensor_shape.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/ff_dim.h"
 #include "utils/containers.h"
 #include "utils/hash-utils.h"
 
@@ -13,6 +14,15 @@ static std::vector<ParallelDim> lift_dims(TensorDims const &dims) {
   return lifted_dims;
 }
 
+int ParallelTensorShape::get_volume() const {
+  int volume  = this->at(ff_dim_t(0)).size;
+  for(int i = 1; i < num_dims(); i++) {
+    volume *= this->at(ff_dim_t(0)).degree;
+  }
+
+  return volume;
+}
+
 ParallelTensorDims::ParallelTensorDims(TensorDims const &dims)
     : data(lift_dims(dims)) {}
 

From f68c83b60a43688787e507c5b5eb28782dddf3f1 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 27 Oct 2023 14:31:24 +0000
Subject: [PATCH 58/69] add concat

---
 .../include/op-attrs/parallel_tensor_shape.h  |  4 +-
 lib/op-attrs/src/attention.cc                 |  2 +-
 lib/op-attrs/src/batch_matmul.cc              | 83 ++++++++++---------
 lib/op-attrs/src/combine.cc                   |  5 +-
 lib/op-attrs/src/concat.cc                    | 18 +++-
 lib/op-attrs/src/parallel_tensor_shape.cc     |  4 +-
 lib/op-attrs/src/reduce.cc                    |  4 +-
 7 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index f30786f55d..e7df3b72df 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -27,8 +27,8 @@ struct ParallelTensorShape : public use_visitable_cmp<ParallelTensorShape> {
 
   int num_dims() const;
 
-  int get_volume() const; 
-  
+  int get_volume() const;
+
   ParallelDim const &at(ff_dim_t const &) const;
   ParallelDim &at(ff_dim_t const &);
   ParallelDim const &operator[](ff_dim_t const &) const;
diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index f89124a94f..cc6d9c3c48 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -58,7 +58,7 @@ TensorShape
 // according to the pytorch
 // https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
 // we consider the batch size
-// query: [replicate_num, seq_len, batch_size, embed_dim],4D, 
+// query: [replicate_num, seq_len, batch_size, embed_dim],4D,
 // key: (replicate_num, seq_len, batch_size, embed_dim)
 // value: (replicate_num ,seq_len, batch_size,embed_dim)
 //  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
diff --git a/lib/op-attrs/src/batch_matmul.cc b/lib/op-attrs/src/batch_matmul.cc
index 38f37bead2..62c0f525e8 100644
--- a/lib/op-attrs/src/batch_matmul.cc
+++ b/lib/op-attrs/src/batch_matmul.cc
@@ -6,60 +6,63 @@
 
 namespace FlexFlow {
 
-//lhs: [<r1, dl1, true>, <b, 1, f> ,<n, dl3, false>, <m, dl4, false>] 
-//rhs:[<r2, dr1, true>, <b,1,f> ,<m, dr3, false>, <p, dr4,false>]
-//in the original tensor, we assume the dl1/dr1 is 1
-//output:[<r3, do1, true>, <b,1,f>, <n, do3, false>, <p,do4, false>]
-//how to decide the r3, d01, do3, do4
-//Note: Lsize = r1 * dl3 * dl4, Rsize = r2 * dr3 * dr4 , Rsize = Lsize
-//do3 = dl3, do4 = dr4
-//so, r3 = Lsize / do3 / do4 
-//r3 / do1 = r1 / dl1 
+// lhs: [<r1, dl1, true>, <b, 1, f> ,<n, dl3, false>, <m, dl4, false>]
+// rhs:[<r2, dr1, true>, <b,1,f> ,<m, dr3, false>, <p, dr4,false>]
+// in the original tensor, we assume the dl1/dr1 is 1
+// output:[<r3, do1, true>, <b,1,f>, <n, do3, false>, <p,do4, false>]
+// how to decide the r3, d01, do3, do4
+// Note: Lsize = r1 * dl3 * dl4, Rsize = r2 * dr3 * dr4 , Rsize = Lsize
+// do3 = dl3, do4 = dr4
+// so, r3 = Lsize / do3 / do4
+// r3 / do1 = r1 / dl1
 ParallelTensorShape get_output_shape(BatchMatmulAttrs const &attrs,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
-if(lhs.num_dims() != 4 || rhs.num_dims() != 4) {
-  throw mk_runtime_error("rhs or lhs dimension is not 4");
-}
+  if (lhs.num_dims() != 4 || rhs.num_dims() != 4) {
+    throw mk_runtime_error("rhs or lhs dimension is not 4");
+  }
 
-int rl = lhs.at(ff_dim_t(0)).size ;// replicate_num of lhs
-int dl1 = lhs.at(ff_dim_t(0)).degree;//degree of 0 dimension
-int dl3 = lhs.at(ff_dim_t(3)).degree;//degree of third dimension
-int dr4 = rhs.at(ff_dim_t(4)).degree;//degree of fouth dimenstion
+  int rl = lhs.at(ff_dim_t(0)).size;    // replicate_num of lhs
+  int dl1 = lhs.at(ff_dim_t(0)).degree; // degree of 0 dimension
+  int dl3 = lhs.at(ff_dim_t(3)).degree; // degree of third dimension
+  int dr4 = rhs.at(ff_dim_t(4)).degree; // degree of fouth dimenstion
 
-int lsize = lhs.get_volume();
-int rsize = rhs.get_volume();
-if(lsize != rsize) {
-  throw mk_runtime_error("BatchMatmulAttrs::get_output_shape, the volume of lhs and rhs are not matched ");
-}
+  int lsize = lhs.get_volume();
+  int rsize = rhs.get_volume();
+  if (lsize != rsize) {
+    throw mk_runtime_error("BatchMatmulAttrs::get_output_shape, the volume of "
+                           "lhs and rhs are not matched ");
+  }
 
-if (lhs.at(ff_dim_t(1)).size != rhs.at(ff_dim_t(1)).size) {
+  if (lhs.at(ff_dim_t(1)).size != rhs.at(ff_dim_t(1)).size) {
     throw mk_runtime_error(
         "BatchMatmulAttrs::get_output_shape, batch size is not equal");
-}
+  }
 
-if(lhs.at(ff_dim_t(3)).size != rhs.at(ff_dim_t(3)).size ) {
-  throw mk_runtime_error(
-        "BatchMatmulAttrs::get_output_shape: forth demension of lhs and third dementions of rhs are not match");
-}
-
-//4D tensor
-ParallelTensorShape output_shape = lhs;
+  if (lhs.at(ff_dim_t(3)).size != rhs.at(ff_dim_t(3)).size) {
+    throw mk_runtime_error(
+        "BatchMatmulAttrs::get_output_shape: forth demension of lhs and third "
+        "dementions of rhs are not match");
+  }
 
-output_shape.at(ff_dim_t(0)).size = lsize / (dl3 * dr4);
-output_shape.at(ff_dim_t(0)).degree = output_shape.at(ff_dim_t(0)).size / (rl / dl1);//this may have some problem
-output_shape.at(ff_dim_t(0)).is_replica_dim = true;
+  // 4D tensor
+  ParallelTensorShape output_shape = lhs;
 
-output_shape.at(ff_dim_t(3)).size = lhs.at(ff_dim_t(3)).size;
-output_shape.at(ff_dim_t(3)).degree = dl3;
-output_shape.at(ff_dim_t(3)).is_replica_dim = false;
+  output_shape.at(ff_dim_t(0)).size = lsize / (dl3 * dr4);
+  output_shape.at(ff_dim_t(0)).degree =
+      output_shape.at(ff_dim_t(0)).size /
+      (rl / dl1); // this may have some problem
+  output_shape.at(ff_dim_t(0)).is_replica_dim = true;
 
-output_shape.at(ff_dim_t(4)).size = rhs.at(ff_dim_t(4)).size();
-output_shape.at(ff_dim_t(4)).degree = dr4;
-output_shape.at(ff_dim_t(4)).is_replica_dim = false;
+  output_shape.at(ff_dim_t(3)).size = lhs.at(ff_dim_t(3)).size;
+  output_shape.at(ff_dim_t(3)).degree = dl3;
+  output_shape.at(ff_dim_t(3)).is_replica_dim = false;
 
-return output_shape;
+  output_shape.at(ff_dim_t(4)).size = rhs.at(ff_dim_t(4)).size();
+  output_shape.at(ff_dim_t(4)).degree = dr4;
+  output_shape.at(ff_dim_t(4)).is_replica_dim = false;
 
+  return output_shape;
 }
 
 /* bool BatchMatmulAttrs::is_valid( */
diff --git a/lib/op-attrs/src/combine.cc b/lib/op-attrs/src/combine.cc
index d851c186c8..ee77fd08b6 100644
--- a/lib/op-attrs/src/combine.cc
+++ b/lib/op-attrs/src/combine.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/combine.h"
+#include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
 
 namespace FlexFlow {
@@ -6,9 +7,11 @@ ParallelTensorShape
     get_output_shape_shape(CombineAttrs const &attrs,
                            ParallelTensorShape const &input_shape) {
   ParallelTensorShape output_shape = input_shape;
+  /*
   output_shape.at(attrs.combine_dim).degree /= attrs.combine_degree;
   output_shape.at(attrs.combine_dim).is_replica_dim =
-      output_shape.at(attrs.combine_dim).degree > 1;
+      output_shape.at(attrs.combine_dim).degree > 1;*/
+  NOT_IMPLEMENTED();
   return output_shape;
 }
 
diff --git a/lib/op-attrs/src/concat.cc b/lib/op-attrs/src/concat.cc
index 47807b9c1a..5efe8855d8 100644
--- a/lib/op-attrs/src/concat.cc
+++ b/lib/op-attrs/src/concat.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/concat.h"
+#include "utils/exception.decl.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -12,11 +13,22 @@ ParallelTensorShape
                              "range or input is invalid");
     }
   }
+
+  int dims = inputs[0].num_dims();
+  for (int i = 1; i < inputs.size(); i++) {
+    if (inputs[i].num_dims() != dims) {
+      throw mk_runtime_error(" the input dims not matched at i:", i);
+    }
+  }
+
   for (auto &i : inputs) {
-    output.at(attrs.axis).size += i.at(attrs.axis).size;
+    output.at(ff_dim_t(attrs.axis)).size += i.at(ff_dim_t(attrs.axis)).size;
+  }
+  output.at(ff_dim_t(0)).is_replica_dim = true;
+  // note: how to decide the degee?
+  for (int i = 1; i < output.num_dims(); i++) {
+    output.at(ff_dim_t(i)).is_replica_dim = false;
   }
-  output.at(attrs.axis).degree = inputs[0].at(attrs.axis).degree;
-  output.at(attrs.axis).is_replica_dim = inputs[0].at(attrs.axis).degree >= 1;
   return output;
 }
 
diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc
index 8f9514e58a..5848991c13 100644
--- a/lib/op-attrs/src/parallel_tensor_shape.cc
+++ b/lib/op-attrs/src/parallel_tensor_shape.cc
@@ -15,8 +15,8 @@ static std::vector<ParallelDim> lift_dims(TensorDims const &dims) {
 }
 
 int ParallelTensorShape::get_volume() const {
-  int volume  = this->at(ff_dim_t(0)).size;
-  for(int i = 1; i < num_dims(); i++) {
+  int volume = this->at(ff_dim_t(0)).size;
+  for (int i = 1; i < num_dims(); i++) {
     volume *= this->at(ff_dim_t(0)).degree;
   }
 
diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 930ec42d76..79fa6d7598 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -7,8 +7,8 @@ namespace FlexFlow {
 ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   NOT_IMPLEMENTED()
-  // reduce is sum/max/min/mean, I think we just return 1D tensor [1, 2, 4] => [7, ]
-  // NOTE: how to implement this
+  // reduce is sum/max/min/mean, I think we just return 1D tensor [1, 2, 4] =>
+  // [7, ] NOTE: how to implement this
 }
 
 } // namespace FlexFlow

From 5ef2497921b817213f42d514afb99d5151645898 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 27 Oct 2023 14:44:03 +0000
Subject: [PATCH 59/69] add conv_2d

---
 lib/op-attrs/src/conv_2d.cc | 64 +++++++++++++------------------------
 1 file changed, 22 insertions(+), 42 deletions(-)

diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 15566f9005..5f86ffa750 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -84,61 +84,41 @@ std::vector<ParallelDimMappingRecord>
   return mappings;
 }
 
-// according to pytorch, the input shape: [b, input_channel, input_h, input_w]
-// kernel shape: [output_channel, input_channel, kernel_h, kernel_w]
-// we may have stide_h and padding_h
-// output shape: [b, output_channel, output_h, output_w]
-// output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
-// output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
+// input: (<ri, di1, t>, <b, 1, f>, <input_channel, di3, f>, < input_h, di4, f>,
+// <input_w, di5, f> ) kernel(Conv2DAttrs):  out_channels, kernel_h, kernel_w,
+// stride_h, stride_w, padding_h, padding_w, output shape:(<ro, do1, t>, <b,
+// 1,f>, <output_channel, do3, f>, <output_h, do4, f>, <output_w, do5,f>)
+//  output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
+//  output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
+// assert: for the kernel, dk1 == dk2=dk4=dk4=dk5=1
+// question:how to decide the ro/do3/do4/do5?
+// I think: do3= di3, di4= do4, di5 = do5, do1=di1, ro=ri
 ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  ParallelTensorShape output = input;
-  if (input.num_dims() != 4) {
+  if (input.num_dims() != 5) {
     throw mk_runtime_error("Conv2DAttrs::get_output_shape: input is invalid");
   }
-
-  if (attrs.kernel_h > input.at(ff_dim_t(2)).size ||
-      attrs.kernel_w > input.at(ff_dim_t(3)).size) {
+  if (attrs.kernel_h > input.at(ff_dim_t(3)).size ||
+      attrs.kernel_w > input.at(ff_dim_t(4)).size) {
     throw mk_runtime_error(
         "Conv2DAttrs::get_output_shape: kernel size is larger than input size");
   }
 
-  output.at(ff_dim_t(1)).size = attrs.out_channels;
-  output.at(ff_dim_t(2)).size =
-      (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
+  ParallelTensorShape output = input;
+  output.at(ff_dim_t(0)).is_replica_dim = true;
+  output.at(ff_dim_t(2)).size = attrs.out_channels;
+  output.at(ff_dim_t(3)).size =
+      (input.at(ff_dim_t(3)).size + 2 * attrs.padding_h - attrs.kernel_h) /
           attrs.stride_h +
       1;
-  output.at(ff_dim_t(3)).size =
-      (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
+  output.at(ff_dim_t(4)).size =
+      (input.at(ff_dim_t(4)).size + 2 * attrs.padding_w - attrs.kernel_w) /
           attrs.stride_w +
       1;
-
-  if (input.at(ff_dim_t(2)).size == 1 && input.at(ff_dim_t(3)).size == 1) {
-    // case 1  input degree is 1, like 1GPU
-    output.at(ff_dim_t(0)).is_replica_dim = false;
-  } else if (input.at(ff_dim_t(2)).size > 1 &&
-             input.at(ff_dim_t(3)).size == 1) {
-    // case 2: [b, input_channel, input_h/x, input_w], [output_channel,
-    // input_channel, kernel_h, kernel_w] => [b, output_channel, output_h/x,
-    // output_w]
-    output.at(ff_dim_t(2)).is_replica_dim = true;
-    output.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
-  } else if (input.at(ff_dim_t(2)).size == 1 &&
-             input.at(ff_dim_t(3)).size > 1) {
-    // case 3: [b, input_channel, input_h, input_w / x] [output_channel,
-    // input_channel, kernel_h, kernel_w / x] => [b, output_channel, output_h,
-    // output_w / x]
-    output.at(ff_dim_t(3)).is_replica_dim = true;
-    output.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
-  } else if (input.at(ff_dim_t(2)).size > 1 && input.at(ff_dim_t(3)).size > 1) {
-    for (int i = 2; i < input.num_dims(); i++) {
-      output.at(ff_dim_t(i)).is_replica_dim = true;
-      output.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
-    }
-  } else {
-    throw mk_runtime_error("Conv2DAttrs::get_output_shape: not supported in "
-                           "Conv2DAttrs get_output_shape");
+  for (int i = 1; i < output.num_dims(); i++) {
+    output.at(ff_dim_t(i)).is_replica_dim = false;
   }
+
   return output;
 }
 

From 7e23eb3302ab744f041540bc0d3462e345585cde Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 27 Oct 2023 14:46:15 +0000
Subject: [PATCH 60/69] add element binary

---
 lib/op-attrs/src/element_binary.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/op-attrs/src/element_binary.cc b/lib/op-attrs/src/element_binary.cc
index 77ddc016b8..57236bb04f 100644
--- a/lib/op-attrs/src/element_binary.cc
+++ b/lib/op-attrs/src/element_binary.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/element_binary.h"
+#include "op-attrs/ff_dim.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -7,7 +8,9 @@ ParallelTensorShape get_output_shape(ElementBinaryAttrs const &atts,
                                      ParallelTensorShape const &lhs,
                                      ParallelTensorShape const &rhs) {
   ParallelTensorShape output = lhs.num_dims() >= rhs.num_dims() ? lhs : rhs;
-  for (int i = 0; i < output.num_dims(); i++) {
+  // how to decide its degree  and size for replicate_num
+  output.at(ff_dim_t(0)).is_replica_dim = false;
+  for (int i = 1; i < output.num_dims(); i++) {
     if (i >= lhs.num_dims()) {
       output.at(ff_dim_t(i)) = rhs.at(ff_dim_t(i));
     } else if (i >= rhs.num_dims()) {

From 9bb4de69bfcbb860575162863b3b9f752fc4e9a5 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 01:02:30 +0000
Subject: [PATCH 61/69] add embedding

---
 lib/op-attrs/src/embedding.cc | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/lib/op-attrs/src/embedding.cc b/lib/op-attrs/src/embedding.cc
index 3aba747036..8407eeebfd 100644
--- a/lib/op-attrs/src/embedding.cc
+++ b/lib/op-attrs/src/embedding.cc
@@ -1,16 +1,38 @@
 #include "op-attrs/ops/embedding.h"
+#include "op-attrs/ff_dim.h"
+#include "op-attrs/parallel_dim.h"
+#include "op-attrs/parallel_tensor_dims.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
 // pytorch nn.Embedding
 // Embedding OP: (num_embeddings, embedding_dim) (num_entries, out_channels)
-// Input: (batch_size, seq_len)
-// Output: (batch_size, seq_len, embedding_dim)
-ParallelTensorShape get_output_shape(EmbeddingAttrs const &atts,
+// input:(<ri, di1, t>, < b, di2, f>, < seq_len, di3, f>)
+// EmbeddingAttrs:req<int> num_entries, out_channels;
+// output:(<ro, do1, t>, <b, do2, f>, <seq_len, do3, f>, <embedding_dim, do4,
+// f>)
+ParallelTensorShape get_output_shape(EmbeddingAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  ParallelTensorShape output = input;
-  output.at(ff_dim_t(2)).size = atts.out_channels;
-  // output degree is same as input degree
+  if (input.num_dims() != 3) {
+    throw mk_runtime_error("for embedding, input shape must be 3D");
+  }
+
+  std::vector<ParallelDim> data;
+  data.resize(4);
+  data[0] = input.at(ff_dim_t(0));
+  data[0].is_replica_dim = true;
+  data[1] = input.at(ff_dim_t(1));
+  data[2] = input.at(ff_dim_t(2));
+  data[3].size = attrs.out_channels; // TODO:what's the embedding_dim?
+  data[3].is_replica_dim = false;
+
+  ParallelTensorShape output = ParallelTensorShape(
+      ParallelTensorDims(TensorDims(data.begin(), data.end())),
+      attrs.data_type);
+
   return output;
 }
 

From 51a9cb748884b1a5c1945eb25c065b12eff82e46 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 01:11:35 +0000
Subject: [PATCH 62/69] add flat

---
 lib/op-attrs/src/flat.cc | 45 +++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/lib/op-attrs/src/flat.cc b/lib/op-attrs/src/flat.cc
index 51c29ec5b7..0cfe71b398 100644
--- a/lib/op-attrs/src/flat.cc
+++ b/lib/op-attrs/src/flat.cc
@@ -1,6 +1,8 @@
 #include "op-attrs/ops/flat.h"
+#include "op-attrs/ff_dim.h"
 #include "parallel_dim_mapping_record.h"
 #include "parallel_dim_mapping_record_solver.h"
+#include "utils/exception.h"
 #include <cassert>
 
 namespace FlexFlow {
@@ -17,20 +19,39 @@ constexpr int NUMDIM = 3, CHANNEL = 0, SAMPLE = 1, REPLICA = 2;
 // flat is like the pytorch view
 // tensor = torch.randn(2, 3, 4)  ,flattened_tensor = tensor.view(-1) #shape:
 // (24)
+// input: (<ri, di, t>, <x0, d1, f>, <x1,d2, f>, ......)
+// assume d1=d2=d3
+// output: 2d dimention (<ri, di, t>, <x0+x1+x2+x3, d0, f> )
 ParallelTensorShape get_output_shape(FlatAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  ParallelTensorShape output_shape(input.dims, input.data_type);
-
-  output_shape.at(ff_dim_t(Output::CHANNEL)).size =
-      input.at(ff_dim_t(Input::CHANNEL)).size *
-      input.at(ff_dim_t(Input::HEIGHT)).size *
-      input.at(ff_dim_t(Input::WIDTH)).size;
-  output_shape.at(ff_dim_t(Output::CHANNEL)).degree =
-      input.at(ff_dim_t(Input::CHANNEL)).degree;
-  output_shape.at(ff_dim_t(Output::CHANNEL)).is_replica_dim =
-      (input.at(ff_dim_t(Input::CHANNEL)).degree > 1);
-
-  return output_shape;
+  if (input.num_dims() < 2) {
+    throw mk_runtime_error("for flat,its dims must greater than 2");
+  }
+
+  int degree = input.at(ff_dim_t(1)).degree;
+  for (int i = 1; i < input.num_dims(); i++) {
+    if (degree != input.at(ff_dim_t(i)).degree) {
+      throw mk_runtime_error(
+          "for flat, all degree should be equal, but elemement ", i, " not");
+    }
+  }
+  std::vector<ParallelDim> data;
+  data.resize(2);
+  data[0] = input.at(ff_dim_t(0));
+  data[0].is_replica_dim = true;
+  data[1].degree = input.at(ff_dim_t(1)).degree;
+  data[1].size = input.at(ff_dim_t(1)).size;
+  data[1].is_replica_dim = false;
+
+  for (int i = 2; i < input.num_dims(); i++) {
+    data[1].size *= input.at(ff_dim_t(i)).size;
+  }
+
+  ParallelTensorShape output = ParallelTensorShape(
+      ParallelTensorDims(TensorDims(data.begin(), data.end())),
+      input.data_type);
+
+  return output;
 }
 
 /* bool FlatAttrs::is_valid(ParallelTensorShape const &input) const { */

From cdb38d0ed5ec09025151177e502493a297193747 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 01:45:24 +0000
Subject: [PATCH 63/69] add pool2d

---
 lib/op-attrs/include/op-attrs/ops/pool_2d.h |  1 -
 lib/op-attrs/src/conv_2d.cc                 | 11 ++-
 lib/op-attrs/src/gather.cc                  | 36 +++-----
 lib/op-attrs/src/groupby.cc                 | 42 +++++----
 lib/op-attrs/src/layer_norm.cc              |  2 +-
 lib/op-attrs/src/linear.cc                  | 47 +++-------
 lib/op-attrs/src/pool_2d.cc                 | 98 +++++++--------------
 lib/op-attrs/src/topk.cc                    |  3 +-
 8 files changed, 91 insertions(+), 149 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
index b688be85f5..3bc862c481 100644
--- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
@@ -17,7 +17,6 @@ struct Pool2DAttrs {
   req<int> kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w;
   req<PoolOp> pool_type;
   req<Activation> activation;
-  bool is_valid(ParallelTensorShape const &) const;
 };
 FF_VISITABLE_STRUCT(Pool2DAttrs,
                     kernel_h,
diff --git a/lib/op-attrs/src/conv_2d.cc b/lib/op-attrs/src/conv_2d.cc
index 5f86ffa750..75c61a82af 100644
--- a/lib/op-attrs/src/conv_2d.cc
+++ b/lib/op-attrs/src/conv_2d.cc
@@ -85,9 +85,14 @@ std::vector<ParallelDimMappingRecord>
 }
 
 // input: (<ri, di1, t>, <b, 1, f>, <input_channel, di3, f>, < input_h, di4, f>,
-// <input_w, di5, f> ) kernel(Conv2DAttrs):  out_channels, kernel_h, kernel_w,
-// stride_h, stride_w, padding_h, padding_w, output shape:(<ro, do1, t>, <b,
-// 1,f>, <output_channel, do3, f>, <output_h, do4, f>, <output_w, do5,f>)
+// <input_w, di5, f> )
+
+// kernel(Conv2DAttrs):  out_channels, kernel_h, kernel_w, stride_h, stride_w,
+// padding_h, padding_w,
+
+// output shape:(<ro, do1, t>, <b, 1,f>, <output_channel, do3, f>, <output_h,
+// do4, f>, <output_w, do5,f>)
+
 //  output_h = (input_h + 2 * padding_h - kernel_h) / stride_h + 1
 //  output_w = (input_w + 2 * padding_w - kernel_w) / stride_w + 1
 // assert: for the kernel, dk1 == dk2=dk4=dk4=dk5=1
diff --git a/lib/op-attrs/src/gather.cc b/lib/op-attrs/src/gather.cc
index ed7e5abd7b..7402bdc67c 100644
--- a/lib/op-attrs/src/gather.cc
+++ b/lib/op-attrs/src/gather.cc
@@ -1,23 +1,8 @@
 #include "op-attrs/ops/gather.h"
-#include "utils/exception.decl.h"
-#include "utils/exceptions.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
-                           ParallelTensorShape const &rhs) const {
-  if (lhs.dims.num_dims() != rhs.dims.num_dims()) {
-    return false;
-  }
-  for (auto i : lhs.dims) {
-    if (ff_dim_t(i.size) != this->dim &&
-        lhs.at(ff_dim_t(i.size)).size < rhs.at(ff_dim_t(i.size)).size) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // https://pytorch.org/docs/stable/generated/torch.gather.html
 //  todo: why return a vector?
 std::vector<ParallelTensorShape>
@@ -26,25 +11,24 @@ std::vector<ParallelTensorShape>
                       ParallelTensorShape const &index) {
   if (input.num_dims() != index.num_dims()) {
     throw mk_runtime_error(
-        "Gather: input and index must have the same number of dimensions");
+        "for gather, the dimensions of input and index are not match");
   }
 
-  for (int i = 0; i < input.num_dims(); i++) {
+  for (int i = 1; i < input.num_dims(); i++) {
     if (i != attrs.dim &&
         input.at(ff_dim_t(i)).size <= index.at(ff_dim_t(i)).size) {
       throw mk_runtime_error(
           "Gather: index.size(d) <= input.size(d) for all dimensions d != dim");
     }
-  }
-
-  ParallelTensorShape output = input;
 
-  std::vector<ParallelTensorShape> results;
-  // NOTE(lambda):why return a vector?
-  results.push_back(output);
-  return results;
+    ParallelTensorShape output = index;
+    output.at(ff_dim_t(0)) = input.at(ff_dim_t(0));
+    std::vector<ParallelTensorShape> results;
+    // NOTE(lambda):why return a vector?
+    results.push_back(output);
+    return results;
+  }
 }
-
 /* bool GatherAttrs::is_valid(ParallelTensorShape const &lhs,
  * ParallelTensorShape const &rhs) const { */
 /*   if (lhs.num_dims() != rhs.num_dims()) { */
diff --git a/lib/op-attrs/src/groupby.cc b/lib/op-attrs/src/groupby.cc
index 64d7797771..17c091e02e 100644
--- a/lib/op-attrs/src/groupby.cc
+++ b/lib/op-attrs/src/groupby.cc
@@ -1,22 +1,32 @@
 #include "op-attrs/ops/groupby.h"
-#include "utils/exceptions.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
-/*
-import torch
-data = torch.tensor([10, 20, 30, 40, 50, 60, 70, 80])
-# group index tensor group_indices
-group_indices = torch.tensor([0, 1, 0, 2, 1, 2, 0, 1])
-
-# groupby operator
-unique_indices, unique_inverse_indices = torch.unique(group_indices,
-return_inverse=True) print(f"unique_indices: {unique_indices} and
-unique_inverse_indices: {unique_inverse_indices}") grouped_data = [] for i in
-unique_indices: # use unique_inverse_indices group_data =
-data[unique_inverse_indices == i] grouped_data.append(group_data) for i, group
-in enumerate(grouped_data): print(f"Group {i}: {group}")
-*/
+// import torch
+// data = torch.tensor([10, 20, 30, 40, 50, 60, 70, 80])
+// # group index tensor group_indices
+// group_indices = torch.tensor([0, 1, 0, 2, 1, 2, 0, 1])
+
+// # groupby operator
+// unique_indices, unique_inverse_indices =
+// torch.unique(group_indices,return_inverse=True)
+
+// print(f"unique_indices: {unique_indices} and unique_inverse_indices:
+// {unique_inverse_indices}")
+
+// grouped_data = []
+
+// for i in unique_indices: # use unique_inverse_indices
+//     group_data = data[unique_inverse_indices == i]
+//     grouped_data.append(group_data)
+
+// for i, group in enumerate(grouped_data):
+//     print(f"Group {i}: {group}")
+
+// Group 0: tensor([10, 30, 70])
+//  Group 1: tensor([20, 50, 80])
+//  Group 2: tensor([40, 60])
 
 ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
                                      ParallelTensorShape const &input_shape,
@@ -25,7 +35,7 @@ ParallelTensorShape get_output_shape(Group_byAttrs const &attrs,
     throw mk_runtime_error(
         "Group_by: input and index must have the same number of dimensions");
   }
-  // degree of output is same as input's
+  // NOTE: how do we decide the group-by output shape?
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/layer_norm.cc b/lib/op-attrs/src/layer_norm.cc
index d98562219f..43211fbf24 100644
--- a/lib/op-attrs/src/layer_norm.cc
+++ b/lib/op-attrs/src/layer_norm.cc
@@ -9,7 +9,7 @@ ParallelTensorShape get_output_shape(LayerNormAttrs const &attrs,
   if (input.num_dims() < 2) {
     throw mk_runtime_error("LayerNorm: input must have at least 2 dimensions");
   }
-  // output degree is same as input degree
+  // output shape is the same as the input shape
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/linear.cc b/lib/op-attrs/src/linear.cc
index ec0f5dd235..6c1748f517 100644
--- a/lib/op-attrs/src/linear.cc
+++ b/lib/op-attrs/src/linear.cc
@@ -11,44 +11,21 @@ namespace FlexFlow {
 //  pytorch linearattrs: should be {input_channels, output_channels}
 //  pytorch: output shape:{batch_size, output_channels}
 //  question: the Linearattrs doesn't have input_channels
-ParallelTensorShape get_output_shape(LinearAttrs const &atts,
-                                     ParallelTensorShape const &input) {
+// input: (<ri, di1, t>, <b, di2, f>, <input_channels, di3, f>)
+// LinearAttrs: should be {input_channels, output_channels}
+// LinearAttrs doesn't have input_channels; it only has output_channels
+// output: (<ro, do1, t>, <b, do2, f>, <output_channels, do3, f>)
+// I think do1 = di1, ro = ri, do2 = di2, do3 = di3
 
-  ParallelTensorShape out_shape = input;
-  if (input.num_dims() != 2) {
-    throw mk_runtime_error("LinearAttrs: input shape should be 2D");
+ParallelTensorShape get_output_shape(LinearAttrs const &attrs,
+                                     ParallelTensorShape const &input) {
+  ParallelTensorShape output_shape = input;
+  if (input.num_dims() != 3) {
+    throw mk_runtime_error("LinearAttrs: input shape should be 3D");
   }
 
-  out_shape.at(ff_dim_t(1)).size = atts.out_channels;
-  // linear shoud consider the degree
-  // case 1: input:[N, K], weight:[K, M], degree is 1
-  if (input.at(ff_dim_t(0)).degree == 1 && input.at(ff_dim_t(1)).degree == 1) {
-    for (int i = 0; i < input.num_dims(); i++) {
-      out_shape.at(ff_dim_t(i)).is_replica_dim = false;
-      out_shape.at(ff_dim_t(i)).degree = 1;
-    }
-  } else if (input.at(ff_dim_t(0)).degree == 1 &&
-             input.at(ff_dim_t(1)).degree > 1) {
-    // case 2: input [N, k/x], weight [k/x, M], output [N, M], degree is x
-    out_shape.at(ff_dim_t(1)).degree = input.at(ff_dim_t(1)).degree;
-    out_shape.at(ff_dim_t(1)).is_replica_dim = true;
-  } else if (input.at(ff_dim_t(0)).degree > 1 &&
-             input.at(ff_dim_t(1)).degree == 1) {
-    // case 3: input [N/X, K], weight [K, M/X], output [N/X, M], degree is X
-    out_shape.at(ff_dim_t(0)).degree = input.at(ff_dim_t(0)).degree;
-    out_shape.at(ff_dim_t(0)).is_replica_dim = true;
-  } else if (input.at(ff_dim_t(0)).degree > 1 &&
-             input.at(ff_dim_t(1)).degree > 1) {
-    // case 4: input [N/X, K/Y], weight [K/Y, M/X], output [N/X, M/X], degree is
-    // X
-    for (int i = 0; i < input.num_dims(); i++) {
-      out_shape.at(ff_dim_t(i)).is_replica_dim = true;
-      out_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
-    }
-  } else {
-    throw mk_runtime_error("LinearAttrs: degree is not supported");
-  }
-  return out_shape;
+  output_shape.at(ff_dim_t(2)).size = attrs.out_channels;
+  return output_shape;
 }
 
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/pool_2d.cc b/lib/op-attrs/src/pool_2d.cc
index ec6253a0b1..23e1c6dd3d 100644
--- a/lib/op-attrs/src/pool_2d.cc
+++ b/lib/op-attrs/src/pool_2d.cc
@@ -41,85 +41,53 @@ static ParallelDimMappingSolution
   return solve_parallel_dim_mappings(construct_mappings(input), {input}, 0, 1);
 }
 
-bool Pool2DAttrs::is_valid(ParallelTensorShape const &input) const {
-  if (!input.is_valid()) {
-    return false;
-  }
-  return true;
-}
-
 // https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
 // https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html
-//  pytorch: we have two type of pool2d, maxpool2d and avgpool2d
-//  input shape: (batch_size, channels, input_height, input_width)
-//  for avgpool2d, output shape: (batch_size, channels, 1, 1)
-//  for maxpool2d, output shape: (batch_size, channels, output_height,
-//  output_width) output_height = (input_height + 2 * padding_h - kernel_h) /
-//  stride_h + 1 output_width = (input_width + 2 * padding_w - kernel_w) /
-//  stride_w + 1
+// input:(< ri, di1, t>, <b, di2, f>, <channels, di3, f>, <input_height, di4,
+// f>, <input_width, di5, f>)
+
+// Pool2DAttrs: req<int> kernel_h, kernel_w, stride_h, stride_w, padding_h,
+// padding_w;
+
+// for avgpool2d: output shape:(< ri, di1, t>, <b, di2, f>, <channels, di3, f>,
+// <1,1,f>, <1,1,f> )
+
+// for maxpool2d, output shape:(< ri, di1, t>, <b, di2, f>, <channels, di3, f>,
+// <output_height, di4, f>, <output_width, di5, f>)
+
+// output_height = (input_height + 2 * padding_h - kernel_h) / stride_h + 1
+// output_width = (input_width + 2 * padding_w - kernel_w) / stride_w + 1
 ParallelTensorShape get_output_shape(Pool2DAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-
-  if (input.num_dims() != 4) {
-    throw mk_runtime_error("Pool2DAttrs: input shape should be 4D");
+  if (input.num_dims() != 5) {
+    throw mk_runtime_error("Pool2DAttrs, input shape should be 5D");
   }
-  ParallelTensorShape output_shape = input;
+
   if (attrs.pool_type == PoolOp::AVG) {
-    output_shape.at(ff_dim_t(2)).size = 1;
-    output_shape.at(ff_dim_t(3)).size = 1;
+    std::vector<ParallelDim> data;
+    data.resize(4);
+    data[0] = input.at(ff_dim_t(0));
+    data[1] = input.at(ff_dim_t(1));
+    data[2] = {1, 1, false};
+    data[3] = {1, 1, false};
+    ParallelTensorShape output = ParallelTensorShape(
+        ParallelTensorDims(TensorDims(data.begin(), data.end())),
+        input.data_type);
+    return output;
   } else if (attrs.pool_type == PoolOp::MAX) {
-    output_shape.at(ff_dim_t(2)).size =
-        (input.at(ff_dim_t(2)).size + 2 * attrs.padding_h - attrs.kernel_h) /
+    ParallelTensorShape output_shape = input;
+    output_shape.at(ff_dim_t(3)).size =
+        (input.at(ff_dim_t(3)).size + 2 * attrs.padding_h - attrs.kernel_h) /
             attrs.stride_h +
         1;
-    output_shape.at(ff_dim_t(3)).size =
-        (input.at(ff_dim_t(3)).size + 2 * attrs.padding_w - attrs.kernel_w) /
+    output_shape.at(ff_dim_t(4)).size =
+        (input.at(ff_dim_t(4)).size + 2 * attrs.padding_w - attrs.kernel_w) /
             attrs.stride_w +
         1;
+    return output_shape;
   } else {
     throw mk_runtime_error("Pool2DAttrs: pool type is not supported");
   }
-
-  // case 1: input:[N, C, H, W], output:[N, C, 1, 1], degree is 1 for avgpool2d
-  // input: [N, C, H, W], output: [N, C, output_height, output_width], degree is
-  // 1 for maxpool2d
-  if (input.at(ff_dim_t(2)).degree == 1 && input.at(ff_dim_t(3)).degree == 1) {
-    for (int i = 2; i < input.num_dims(); i++) {
-      output_shape.at(ff_dim_t(i)).is_replica_dim = false;
-      output_shape.at(ff_dim_t(i)).degree = 1;
-    }
-  } else if (input.at(ff_dim_t(2)).degree > 1 &&
-             input.at(ff_dim_t(3)).degree == 1) {
-    // case 2: input [N, C, H/X, W] output [N, C, 1, 1], degree is X
-    // input [N, C, H/X, W] output [N, C, output_height/x, output_width], degree
-    // is X
-    output_shape.at(ff_dim_t(2)).degree = input.at(ff_dim_t(2)).degree;
-    output_shape.at(ff_dim_t(2)).is_replica_dim = true;
-    output_shape.at(ff_dim_t(3)).degree = 1;
-    output_shape.at(ff_dim_t(3)).is_replica_dim = false;
-  } else if (input.at(ff_dim_t(2)).degree == 1 &&
-             input.at(ff_dim_t(3)).degree > 1) {
-    // case 3: input [N, C, H, W/X] output [N, C, 1, 1], degree is X
-    // input [N, C, H, W/X] output [N, C, output_height, output_width/x], degree
-    // is X
-    output_shape.at(ff_dim_t(2)).degree = 1;
-    output_shape.at(ff_dim_t(2)).is_replica_dim = false;
-    output_shape.at(ff_dim_t(3)).degree = input.at(ff_dim_t(3)).degree;
-    output_shape.at(ff_dim_t(3)).is_replica_dim = true;
-  } else if (input.at(ff_dim_t(2)).degree > 1 &&
-             input.at(ff_dim_t(3)).degree > 1) {
-    // case 4: input [N, C, H/X, W/Y] output [N, C, 1, 1], degree is X and Y for
-    // avgpool2d input [N, C, H/X, W/Y] output [N, C, output_height/x,
-    // output_width/y], degree is X and Y for maxpool2d
-    for (int i = 2; i < input.num_dims(); i++) {
-      output_shape.at(ff_dim_t(i)).is_replica_dim = true;
-      output_shape.at(ff_dim_t(i)).degree = input.at(ff_dim_t(i)).degree;
-    }
-  } else {
-    throw mk_runtime_error("Pool2DAttrs: degree is not supported");
-  }
-
-  return output_shape;
 }
 
 /* ParallelTensorShape Pool2DAttrs::calculate_output_shape(ParallelTensorShape
diff --git a/lib/op-attrs/src/topk.cc b/lib/op-attrs/src/topk.cc
index 73bf59b048..8c1d043a57 100644
--- a/lib/op-attrs/src/topk.cc
+++ b/lib/op-attrs/src/topk.cc
@@ -15,8 +15,7 @@ ParallelTensorShape get_output_shape(TopKAttrs const &attrs,
   output.at(ff_dim_t(attrs.axis)).size = attrs.k;
   output.at(ff_dim_t(attrs.axis)).degree =
       input.at(ff_dim_t(attrs.axis)).degree;
-  output.at(ff_dim_t(attrs.axis)).is_replica_dim =
-      input.at(ff_dim_t(attrs.axis)).degree > 1;
+  output.at(ff_dim_t(attrs.axis)).is_replica_dim = false;
   return output;
 }
 

From 9f49b959002f392168a048d07b3155797335963b Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 01:49:44 +0000
Subject: [PATCH 64/69] add reduce

---
 lib/op-attrs/src/reduce.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/op-attrs/src/reduce.cc b/lib/op-attrs/src/reduce.cc
index 79fa6d7598..b28c722268 100644
--- a/lib/op-attrs/src/reduce.cc
+++ b/lib/op-attrs/src/reduce.cc
@@ -1,14 +1,22 @@
 #include "op-attrs/ops/reduce.h"
-#include "utils/exceptions.h"
+#include "utils/exception.decl.h"
+#include "utils/exception.h"
 
 namespace FlexFlow {
 
 //
 ParallelTensorShape get_output_shape(ReduceAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  NOT_IMPLEMENTED()
-  // reduce is sum/max/min/mean, I think we just return 1D tensor [1, 2, 4] =>
-  // [7, ] NOTE: how to implement this
+  if (input.num_dims() - attrs.axes.size() == 1) {
+    throw mk_runtime_error(" for reduce, the input and attrs.axes must match");
+  }
+  ParallelTensorShape output = input;
+  for (int i = 0; i < attrs.axes.size(); i++) {
+    output.at(attrs.axes.at(i)).size = 1;
+    output.at(attrs.axes.at(i)).is_replica_dim = false;
+  }
+
+  return output;
 }
 
 } // namespace FlexFlow

From a715cdf9eae26ef492f9d89cc90c92ac8de2632a Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 02:07:51 +0000
Subject: [PATCH 65/69] implement the reshape

---
 lib/op-attrs/src/reduction.cc   |  3 +-
 lib/op-attrs/src/repartition.cc |  3 +-
 lib/op-attrs/src/replicate.cc   |  6 ++--
 lib/op-attrs/src/reshape.cc     | 52 ++++++++++++++++++++-------------
 4 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/lib/op-attrs/src/reduction.cc b/lib/op-attrs/src/reduction.cc
index 6336e15253..48455bd706 100644
--- a/lib/op-attrs/src/reduction.cc
+++ b/lib/op-attrs/src/reduction.cc
@@ -13,8 +13,7 @@ namespace FlexFlow {
 ParallelTensorShape get_output_shape(ReductionAttrs const &attrs,
                                      ParallelTensorShape const &input_shape) {
   ParallelTensorShape output(input_shape.dims, input_shape.data_type);
-  output.at(attrs.reduction_dim).degree /= attrs.reduction_degree;
-  output.at(attrs.reduction_dim).size /= attrs.reduction_degree;
+  output.at(attrs.reduction_dim).size = 1;
   return output;
 }
 
diff --git a/lib/op-attrs/src/repartition.cc b/lib/op-attrs/src/repartition.cc
index d7807fcc10..3046cf1ca7 100644
--- a/lib/op-attrs/src/repartition.cc
+++ b/lib/op-attrs/src/repartition.cc
@@ -14,7 +14,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &attrs,
                            "attrs.repartition_degree * dim.degree != 0");
   }
   ParallelTensorShape output(input.dims, input.data_type);
-  output.at(attrs.repartition_dim).degree *= attrs.repartition_degree;
+  output.at(attrs.repartition_dim).degree *=
+      attrs.repartition_degree; // NOTE: this may have some problem
   return output;
 }
 
diff --git a/lib/op-attrs/src/replicate.cc b/lib/op-attrs/src/replicate.cc
index 7a1b511a5e..b3c4e8e970 100644
--- a/lib/op-attrs/src/replicate.cc
+++ b/lib/op-attrs/src/replicate.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/replicate.h"
+#include "op-attrs/ff_dim.h"
 #include "op-attrs/parallel_dim.h"
 #include "utils/exception.h"
 
@@ -19,9 +20,8 @@ ParallelTensorShape get_output_shape(ReplicateAttrs const &attrs,
                            "range or input is invalid");
   }
   ParallelTensorShape output = input;
-  output.at(attrs.replicate_dim).size *= attrs.replicate_degree;
-  output.at(attrs.replicate_dim).is_replica_dim =
-      (input.at(attrs.replicate_dim).degree > 1);
+  output.at(ff_dim_t(0)).is_replica_dim = true;
+  output.at(ff_dim_t(0)).size *= attrs.replicate_degree;
   return output;
 }
 
diff --git a/lib/op-attrs/src/reshape.cc b/lib/op-attrs/src/reshape.cc
index b7e887002a..1cbbd36863 100644
--- a/lib/op-attrs/src/reshape.cc
+++ b/lib/op-attrs/src/reshape.cc
@@ -1,17 +1,15 @@
 #include "op-attrs/ops/reshape.h"
 #include "op-attrs/ff_dim.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
 
 // https://pytorch.org/docs/stable/generated/torch.reshape.html
-//  pytorch: the input: [2,3,4], shape maybe [-1,6]， should we add this? and
-//  the output is [4, 6] currently we doesn't consider the case of -1,we can
-//  support this later the input:[2,3,4], attrs.shape:[4,6], the output is [4,
-//  6]
 ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
                                      ParallelTensorShape const &input) {
-  std::size_t input_volume = input.dims.get_volume();
+  std::size_t input_volume =
+      input.dims.get_volume() / input.at(ff_dim_t(0)).size;
   std::size_t attrs_volume = 1;
   for (int i = 0; i < attrs.shape.dims.num_dims(); i++) {
     attrs_volume *= attrs.shape.at(ff_dim_t(i));
@@ -20,28 +18,42 @@ ParallelTensorShape get_output_shape(ReshapeAttrs const &attrs,
     throw mk_runtime_error("ReshapeAttrs: input_volume != attrs_volume");
   }
 
-  ParallelTensorShape output = input;
-  output.data_type = input.data_type;
+  std::vector<ParallelDim> data;
+
   if (attrs.shape.dims.num_dims() == 1) {
     // infer the shape
     if (attrs.shape.at(ff_dim_t(0)) == -1) {
-
-      output.at(ff_dim_t(0)).size = input_volume;
-      output.at(ff_dim_t(0)).degree = 1;
-      output.at(ff_dim_t(0)).is_replica_dim = false;
+      // the output shape will be (<r, d1 , t>, <input_volume, d2, f>)
+      data.resize(2);
+      data[0] = input.at(ff_dim_t(0));
+      data[1].size = input_volume;
+      // how to decide the degree?
+      ParallelTensorShape output = ParallelTensorShape(
+          ParallelTensorDims(TensorDims(data.begin(), data.end())),
+          input.data_type);
+      return output;
     } else {
-      output.at(ff_dim_t(0)).size = attrs.shape.at(ff_dim_t(0));
-      output.at(ff_dim_t(1)).size = input_volume / attrs.shape.at(ff_dim_t(0));
-      for (int i = 0; i < 2; i++) {
-        output.at(ff_dim_t(i)).degree = 1;
-        output.at(ff_dim_t(i)).is_replica_dim = false;
+      // let i = attrs.shape.at(ff_dim_t(0));
+      // the output shape will be (<r, d1, t>, <i, _, f>, <input_volume / i, _,
+      // f>)
+      data.resize(3);
+      data[0] = input.at(ff_dim_t(0));
+      data[1].size = attrs.shape.at(ff_dim_t(0));
+      data[2].size = input_volume / attrs.shape.at(ff_dim_t(0));
+      for (int i = 1; i < 3; i++) {
+        // how to decide the degree?
+        data[i].is_replica_dim = false;
       }
+      ParallelTensorShape output = ParallelTensorShape(
+          ParallelTensorDims(TensorDims(data.begin(), data.end())),
+          input.data_type);
+      return output;
     }
-  } else {
-    ParallelTensorDims dims{attrs.shape.dims};
-    output = {dims, input.data_type};
-    // Note: I think reshape doesn't need to consider the degree
   }
+
+  ParallelTensorDims dims{attrs.shape.dims};
+  ParallelTensorShape output = {dims, input.data_type};
+
   return output;
 }
 

From 326f7f3d5971a2e3ee477e5641d5d657f8acd46e Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 02:24:11 +0000
Subject: [PATCH 66/69] implement the split

---
 lib/op-attrs/include/op-attrs/ops/flat.h   |  3 +++
 lib/op-attrs/include/op-attrs/ops/gather.h |  6 +++---
 lib/op-attrs/include/op-attrs/ops/split.h  |  4 ++--
 lib/op-attrs/src/reverse.cc                |  1 -
 lib/op-attrs/src/split.cc                  | 18 ++++++++++++++++--
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/lib/op-attrs/include/op-attrs/ops/flat.h b/lib/op-attrs/include/op-attrs/ops/flat.h
index 706689199d..88b0a6cb54 100644
--- a/lib/op-attrs/include/op-attrs/ops/flat.h
+++ b/lib/op-attrs/include/op-attrs/ops/flat.h
@@ -11,6 +11,9 @@ struct FlatAttrs {};
 FF_VISITABLE_STRUCT(FlatAttrs);
 CHECK_VALID_OP_ATTR(FlatAttrs);
 
+ParallelTensorShape get_output_shape(FlatAttrs const &attrs,
+                                     ParallelTensorShape const &input);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h
index 852dc9cd5e..ad97e52556 100644
--- a/lib/op-attrs/include/op-attrs/ops/gather.h
+++ b/lib/op-attrs/include/op-attrs/ops/gather.h
@@ -14,9 +14,9 @@ struct GatherAttrs {
 FF_VISITABLE_STRUCT(GatherAttrs, dim);
 CHECK_VALID_OP_ATTR(GatherAttrs);
 
-std::vector<ParallelTensorShape> get_output_shapes(GatherAttrs const &,
-                                                   ParallelTensorShape const &,
-                                                   ParallelTensorShape const &);
+std::vector<ParallelTensorShape> get_output_shape(GatherAttrs const &,
+                                                  ParallelTensorShape const &,
+                                                  ParallelTensorShape const &);
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/split.h b/lib/op-attrs/include/op-attrs/ops/split.h
index 14f9395a26..f2f904a9f7 100644
--- a/lib/op-attrs/include/op-attrs/ops/split.h
+++ b/lib/op-attrs/include/op-attrs/ops/split.h
@@ -13,8 +13,8 @@ struct SplitAttrs {
 };
 FF_VISITABLE_STRUCT(SplitAttrs, splits, axis);
 CHECK_VALID_OP_ATTR(SplitAttrs);
-std::vector<ParallelTensorShape> get_output_shapes(SplitAttrs const &,
-                                                   ParallelTensorShape const &);
+std::vector<ParallelTensorShape> get_output_shape(SplitAttrs const &,
+                                                  ParallelTensorShape const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/src/reverse.cc b/lib/op-attrs/src/reverse.cc
index 663150861f..f79fcfd0ed 100644
--- a/lib/op-attrs/src/reverse.cc
+++ b/lib/op-attrs/src/reverse.cc
@@ -9,7 +9,6 @@ ParallelTensorShape get_output_shape(ReverseAttrs const &attrs,
   if (attrs.axis < 0 || attrs.axis >= input_shape.num_dims()) {
     throw mk_runtime_error("ReverseAttrs: axis is invalid");
   }
-  // output degree is same as input degree, because it's just reverse operation
   return input_shape;
 }
 
diff --git a/lib/op-attrs/src/split.cc b/lib/op-attrs/src/split.cc
index 5c6f3d6924..8ab083c46b 100644
--- a/lib/op-attrs/src/split.cc
+++ b/lib/op-attrs/src/split.cc
@@ -21,8 +21,22 @@ std::vector<ParallelTensorShape>
     outputs.back().at(ff_dim_t(attrs.axis)).size = attrs.splits[i];
     outputs.back().at(ff_dim_t(attrs.axis)).degree =
         input.at(ff_dim_t(attrs.axis)).degree;
-    outputs.back().at(ff_dim_t(attrs.axis)).is_replica_dim =
-        input.at(ff_dim_t(attrs.axis)).degree > 1;
+    outputs.back().at(ff_dim_t(attrs.axis)).is_replica_dim = attrs.axis == 0;
+  }
+  return outputs;
+}
+
+std::vector<ParallelTensorShape>
+    get_output_shape(SplitAttrs const &attrs,
+                     ParallelTensorShape const &input) {
+  std::size_t dims_sum = sum(attrs.splits);
+  if (dims_sum != input.at(ff_dim_t(attrs.axis)).size) {
+    throw mk_runtime_error(
+        "SplitAttrs: dims_sum != input.at(ff_dim_t(attrs.axis)).size");
+  }
+
+  std::vector<ParallelTensorShape> outputs;
+  for (std::size_t i = 0; i < attrs.splits.size(); ++i) {
   }
   return outputs;
 }

From 6c824660f0c6f5d687f28e272e3098853923d7dc Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 02:28:21 +0000
Subject: [PATCH 67/69] add transpose

---
 lib/op-attrs/src/transpose.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/op-attrs/src/transpose.cc b/lib/op-attrs/src/transpose.cc
index 88772b72e0..ac853bd0ed 100644
--- a/lib/op-attrs/src/transpose.cc
+++ b/lib/op-attrs/src/transpose.cc
@@ -4,18 +4,20 @@
 
 namespace FlexFlow {
 
-// assume we have [x, y, z, l], perms is [0,2] we return [z, y, x, l]
+// assume input: [<ri, di1, t>, <x, di2, f>, <y, di3, f>, <z, di4, f>]
+// perm is [1, 2]
+// output: [<ri, di1, t>, <y, di3, f>, <x, di2, f>, <z, di4, f>]
 ParallelTensorShape get_output_shape(TransposeAttrs const &attrs,
                                      ParallelTensorShape const &input) {
   if (attrs.perm.size() != 2) {
     throw mk_runtime_error("TransposeAttrs: perm.size() != 2");
   }
 
-  auto dim0 = attrs.perm[0];
+  auto dim0 = attrs.perm[0]; // dim0 and dim1 should not be 0
   auto dim1 = attrs.perm[1];
-  if (dim0 < 0 || dim1 < 0 || dim0 >= input.num_dims() ||
+  if (dim0 <= 0 || dim1 <= 0 || dim0 >= input.num_dims() ||
       dim1 >= input.num_dims()) {
-    throw mk_runtime_error("TransposeAttrs: dim0 < 0 || dim1 < 0 || dim0 >= "
+    throw mk_runtime_error("TransposeAttrs: dim0 <= 0 || dim1 <= 0 || dim0 >= "
                            "input.num_dims() || dim1 >= input.num_dims()");
   }
 
@@ -26,10 +28,8 @@ ParallelTensorShape get_output_shape(TransposeAttrs const &attrs,
   output.at(ff_dim_t(dim1)).size = temp;
   output.at(ff_dim_t(dim0)).degree = input.at(ff_dim_t(dim1)).degree;
   output.at(ff_dim_t(dim1)).degree = degree;
-  output.at(ff_dim_t(dim0)).is_replica_dim =
-      output.at(ff_dim_t(dim0)).degree > 1;
-  output.at(ff_dim_t(dim1)).is_replica_dim =
-      output.at(ff_dim_t(dim1)).degree > 1;
+  output.at(ff_dim_t(dim0)).is_replica_dim = dim0 == 0;
+  output.at(ff_dim_t(dim1)).is_replica_dim = dim1 == 0;
   return output;
 }
 

From ce7ba69830840403fa1eaaf2cd443525e76c1ba2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 02:55:08 +0000
Subject: [PATCH 68/69]  leave the attention

---
 lib/op-attrs/src/attention.cc | 88 +++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index cc6d9c3c48..717487d1db 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "utils/exception.decl.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -122,6 +123,93 @@ ParallelTensorShape get_output_shape(
   return output;
 }
 
+// https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
+// we consider the batch size
+// query/key/value: 4D dimensions
+//query:[<rq, dq0, t>, <seq_len, dq1, f>, <b, dq2, f>, <embed_dim, dq3, f>]
+
+// key:[<rk, dk0, t>, <seq_len, dk1, f>, <b, dk2,f>, <embed_dim, dk3, f>]
+
+// value:[<rv, dv0, t>, <seq_len, dv1, f>, <b, dv3,f>, <embed_dim, dv3, f>]
+//  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+
+
+// output: (seq_len, batch_size, embed_dim)
+
+//output: [<ro, do0, t>, <seq_len, do1, f>, <b, do2, f>, <embed_dim, do3, f>]
+
+
+//how to decide the ro/do0?  ro = rq * dq1 * dq12 * dq3 / do1/do2/do3
+
+//how to decide the do1 or do2 or do3?
+//ro / do0  = dq / dq0 
+
+ // k->(<<rk, dk0, t>, <seq_len, dk1, f>,<num_head, dk3, f>, <b, dk2,f>,, <kdim, dk3,f> ) //num_head * kdim = embed_dim 
+
+// v->(<<rv, dv0, t>, <seq_len, dv1, f>,<num_head, dv3, f>, <b, dv2,f>,, <vdim, dv3,f> ) //num_head * vdim = embed_dim 
+
+// q->(<<rq, dq0, t>, <seq_len, dq1, f>, <num_head, dq3, f>. <b, dq2,f>,, <kdim, dq3,f> ) //num_head * kdim = embed_dim 
+
+//  // attn = q @k (<ra11, da11, t>, <seq_len, da12, f>, <b, da13,f>,  <b, da14,f>)
+
+//how to decide the ra11/da11/da12/da13/da14?
+
+//rk * dk2 * dk3 = rv * dv2 * dv3 , dk3 = dv3, 
+
+//so da13 = dk2, da14 = dv2, ra11 = rk * dk2 * dk3 / (da13 * da14) = rk * dk3 / dv2 = rv 
+
+//da11 = rk / dk0 / ra11
+
+//attn: (<rv, rk / dk0 / rv, t>, <seq_len, da12, f>, <b, dk2,f>,  <b, dv2,f>)
+
+
+
+ParallelTensorShape get_output_shape(
+    MultiHeadAttentionAttrs const &attrs,
+    MultiHeadAttentionInputs<ParallelTensorShape> const &input) {
+
+  if (input.query.num_dims() != 4 || input.key.num_dims() != 4 ||
+      input.value.num_dims() != 4) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs: num_dims != 4");
+  }
+
+  
+  if (input.query.at(ff_dim_t(1)).size != input.key.at(ff_dim_t(1)).size ||
+      input.query.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size ||
+      input.key.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs: seq_len not match");
+  }
+
+  if (input.query.at(ff_dim_t(2)).size != input.key.at(ff_dim_t(2)).size ||
+      input.query.at(ff_dim_t(2)).size != input.value.at(ff_dim_t(2)).size ||
+      input.key.at(ff_dim_t(2)).size != input.value.at(ff_dim_t(2)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs: batch_size not match");
+  }
+
+  if (input.query.at(ff_dim_t(3)).size != input.key.at(ff_dim_t(3)).size ||
+      input.query.at(ff_dim_t(3)).size != input.value.at(ff_dim_t(3)).size ||
+      input.key.at(ff_dim_t(3)).size != input.value.at(ff_dim_t(3)).size) {
+    throw mk_runtime_error("MultiHeadAttentionAttrs:  embed_dim not match");
+  }
+
+  if (input.query.at(ff_dim_t(3)).size != attrs.embed_dim ||
+      input.key.at(ff_dim_t(3)).size != attrs.embed_dim ||
+      input.value.at(ff_dim_t(3)).size != attrs.embed_dim) {
+    throw mk_runtime_error(
+        "MultiHeadAttentionAttrs:  input's embed_dim not match to attrs");
+  }
+
+  if (attrs.embed_dim != (attrs.num_heads * attrs.kdim)) {
+    throw mk_runtime_error(
+        "MultiHeadAttentionAttrs:  embed_dim not match to num_heads * kdim");
+  }
+
+
+  NOT_IMPLEMENTED();
+
+}
+
+
 } // namespace FlexFlow
 
 // Tensor FFModel::multihead_attention(const Tensor query,

From fec4928ea094b327724b0ac7b2ace4f9450cd704 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 28 Oct 2023 23:54:21 +0000
Subject: [PATCH 69/69] add attention

---
 lib/op-attrs/src/attention.cc | 82 +++++++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/lib/op-attrs/src/attention.cc b/lib/op-attrs/src/attention.cc
index 717487d1db..fb9ab0cd29 100644
--- a/lib/op-attrs/src/attention.cc
+++ b/lib/op-attrs/src/attention.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/ops/attention.h"
+#include "op-attrs/ff_dim.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "utils/exception.decl.h"
 #include "utils/exception.h"
@@ -111,7 +112,7 @@ ParallelTensorShape get_output_shape(
   // attn = q @k  (seq_len, num_head, batch_size, batch_size)
   // attn = attn @v (seq_len, num_head, batch_size, vdim)
   // attn = attn.transpose(1,2) (seq_len, batch_size, num_head, vdim)
-  // attn = attn.reshape(seq_len, batch_size, num_head*vdim)
+  //
 
   // Note: we support tensor parallelism for seq_len/batch_size/embed_dim
   ParallelTensorShape output = input.query;
@@ -126,43 +127,78 @@ ParallelTensorShape get_output_shape(
 // https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html,
 // we consider the batch size
 // query/key/value: 4D dimensions
-//query:[<rq, dq0, t>, <seq_len, dq1, f>, <b, dq2, f>, <embed_dim, dq3, f>]
+// query:[<rq, dq0, t>, <seq_len, dq1, f>, <b, dq2, f>, <embed_dim, dq3, f>]
 
 // key:[<rk, dk0, t>, <seq_len, dk1, f>, <b, dk2,f>, <embed_dim, dk3, f>]
 
 // value:[<rv, dv0, t>, <seq_len, dv1, f>, <b, dv3,f>, <embed_dim, dv3, f>]
 //  multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
 
+// k: (<rk, dk0, t>, <seq_len, dk1, f>, <b, dk3, f>, <embed_dim, dk4, f>)
 
-// output: (seq_len, batch_size, embed_dim)
+// k -> (<rk, dk0, t>, <seq_len, dk1, f>, <num_head, dk2, f>, <b, dk3, f>,
+// <kdim, dk4, f>) // num_head * kdim = embed_dim, dk2 = dk4
+
+// v -> (<rv, dv0, t>, <seq_len, dv1, f>, <num_head, dv2, f>, <b, dv3, f>,
+// <vdim, dv4, f>) // num_head * vdim = embed_dim, dv2 = dv4
+
+// q -> (<rq, dq0, t>, <seq_len, dq1, f>, <num_head, dq2, f>, <b, dq3, f>,
+// <kdim, dq4, f>) // num_head * kdim = embed_dim, dq2 = dq4
+
+// we have dk1 = dv1 = dq1 and dk2 = dk4 = dv2 = dv4 = dq2 = dq4
+
+// 1) attn = q @ k: (<ra11, da10, t>, <seq_len, da11, f>, <num_head, da12, f>,
+// <b, da13, f>, <b, da14, f>)
+
+// how to decide ra11/da10/da11/da12/da13/da14? ⇒ I think da11 = dk1, da12 =
+// dk2, da13 = dk3, da14 = dq3
 
-//output: [<ro, do0, t>, <seq_len, do1, f>, <b, do2, f>, <embed_dim, do3, f>]
+// rk * dk3 * dk4=rq * dq3 * dq4 = ra11 * da13 * da14 = ra11 * dk3 * dq3
 
+// => ra11 = (rk * dk4) / dq3 = (rq * dq4) / dk3 , ra11/da10 = rq / dq0,
 
-//how to decide the ro/do0?  ro = rq * dq1 * dq12 * dq3 / do1/do2/do3
+// =>da10 =  ra11 * dq0 / rq = dq0 * dq4 / dk3
 
-//how to decide the do1 or do2 or do3?
-//ro / do0  = dq / dq0 
+// output attn: (< (rq * dq4) / dk3, dq0 * dq4 / dk3, t>, <seq_len, dk1, f>,
+// <num_head, dk2, f>, <b, dk3, f>,  <b, dq3,f>)
 
- // k->(<<rk, dk0, t>, <seq_len, dk1, f>,<num_head, dk3, f>, <b, dk2,f>,, <kdim, dk3,f> ) //num_head * kdim = embed_dim 
+// 2)attn = attn @v (seq_len, num_head, batch_size, vdim)
 
-// v->(<<rv, dv0, t>, <seq_len, dv1, f>,<num_head, dv3, f>, <b, dv2,f>,, <vdim, dv3,f> ) //num_head * vdim = embed_dim 
+// input attn:(< (rq * dq4) / dk3, dq0 * dq4 / dk3, t>, <seq_len, dk1, f>,
+// <num_head, dk2, f>, <b, dk3, f>,  <b, dq3,f>)
 
-// q->(<<rq, dq0, t>, <seq_len, dq1, f>, <num_head, dq3, f>. <b, dq2,f>,, <kdim, dq3,f> ) //num_head * kdim = embed_dim 
+// input v: (<rv, dv0, t>, <seq_len, dv1, f>, <num_head, dv2, f>, <b,
+// dv3, f>, <vdim, dv4, f>) // num_head * vdim = embed_dim
 
-//  // attn = q @k (<ra11, da11, t>, <seq_len, da12, f>, <b, da13,f>,  <b, da14,f>)
+// output attn: (<ra21, da20, t>, <seq_len, da21, f>, <num_head, da22, f>, <b,
+// da23, f>, <vdim, da24, f>)
 
-//how to decide the ra11/da11/da12/da13/da14?
+// how to decide ra21/da20/da21/da22/da23/da24? ⇒ da21 = dk1, da22 = dk2, da23
+// = dk3, da24 = dv4
 
-//rk * dk2 * dk3 = rv * dv2 * dv3 , dk3 = dv3, 
+// ra21 * da23 * da24 = rv * dv3 * dv4 ⇒ ra21 = (rv * dv3) / dk3
 
-//so da13 = dk2, da14 = dv2, ra11 = rk * dk2 * dk3 / (da13 * da14) = rk * dk3 / dv2 = rv 
+// ra21 / da20 = (rq * dq4) / dk3 / (dq0 * dq4 / dk3) ⇒ da20 = (rv * dv3 * dq0)
+// / (rq * dk3)
 
-//da11 = rk / dk0 / ra11
+// output attn: (<(rv * dv3) / dk3, (rv * dv3 * dq0) / (rq * dk3), t>, <seq_len,
+// dk1, f>, <num_head, dk2, f>, <b, dk3, f>, <vdim, dv4, f>)
 
-//attn: (<rv, rk / dk0 / rv, t>, <seq_len, da12, f>, <b, dk2,f>,  <b, dv2,f>)
+// 3) attn = attn.transpose(1,2 ) (seq_len, batch_size, num_head, vdim)
 
+// input attn: (<(rv * dv3) / dk3, (rv * dv3 * dq0) / (rq * dk3), t>, <seq_len,
+// dk1, f>, <num_head, dk2, f>, <b, dk3, f>, <vdim, dv4, f>)
 
+// output attn: (<(rv * dv3) / dk3, (rv * dv3 * dq0) / (rq * dk3), t>, <seq_len,
+// dk1, f>, <b, dk3, f>, <num_head, dk2, f>, <vdim, dv4, f>)
+
+// 4)attn = attn.reshape(seq_len, batch_size, num_head*vdim)
+
+// input attn: (<(rv * dv3) / dk3, (rv * dv3 * dq0) / (rq * dk3), t>, <seq_len,
+// dk1, f>, <b, dk3, f>, <num_head, dk2, f>, <vdim, dv4, f>)
+
+// output attn: (<(rv * dv3) / dk3, (rv * dv3 * dq0) / (rq * dk3), t>, <seq_len,
+// dk1, f>, <b, dk3, f>, <num_head * vdim, dk4, f>)
 
 ParallelTensorShape get_output_shape(
     MultiHeadAttentionAttrs const &attrs,
@@ -173,7 +209,6 @@ ParallelTensorShape get_output_shape(
     throw mk_runtime_error("MultiHeadAttentionAttrs: num_dims != 4");
   }
 
-  
   if (input.query.at(ff_dim_t(1)).size != input.key.at(ff_dim_t(1)).size ||
       input.query.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size ||
       input.key.at(ff_dim_t(1)).size != input.value.at(ff_dim_t(1)).size) {
@@ -204,12 +239,21 @@ ParallelTensorShape get_output_shape(
         "MultiHeadAttentionAttrs:  embed_dim not match to num_heads * kdim");
   }
 
+  ParallelTensorShape output = input.key;
 
-  NOT_IMPLEMENTED();
+  output.at(ff_dim_t(0)).size =
+      (input.value.at(ff_dim_t(0)).size * input.value.at(ff_dim_t(2)).degree) /
+      input.key.at(ff_dim_t(2)).degree; // rv * dv3 / dk3
+  output.at(ff_dim_t(0)).degree =
+      (input.value.at(ff_dim_t(0)).size * input.value.at(ff_dim_t(2)).degree *
+       input.query.at(ff_dim_t(0)).degree) /
+      (input.query.at(ff_dim_t(0)).size * input.key.at(ff_dim_t(2)).degree);
+  // (rv * dv3 * dq0) / (rq * dk3)
+  output.at(ff_dim_t(0)).is_replica_dim = true;
 
+  return output;
 }
 
-
 } // namespace FlexFlow
 
 // Tensor FFModel::multihead_attention(const Tensor query,