amd
diff --git a/aie_kernels/aie2p/softmax.cc b/aie_kernels/aie2p/softmax.cc
Lines changed: 8 additions & 0 deletions
diff --git a/aie_kernels/generic/mv.cc b/aie_kernels/generic/mv.cc
Lines changed: 11 additions & 14 deletions
diff --git a/conftest.py b/conftest.py
Lines changed: 3 additions & 1 deletion
diff --git a/iron/common/__init__.py b/iron/common/__init__.py
Lines changed: 11 additions & 3 deletions
diff --git a/iron/common/aie_base.py b/iron/common/aie_base.py
Lines changed: 0 additions & 229 deletions
@@ -177,4 +177,12 @@ void partial_softmax_bf16(bfloat16 *restrict input,
     partial_softmax_alias_bf16(input, output, scale_buffer, input_size, row_idx, num_rows, scale);
 }
 
+void mask_bf16(bfloat16 *inout, const int32 unmasked_size, const int32 total_size)
+{
+    // Sets inout[i] to -inf for unmasked_size <= i < total_size (scalar loop). TODO: Optimize this to use vector code
+    for (int32 i = unmasked_size; i < total_size; i++) {
+        inout[i] = (bfloat16)(-INFINITY);
+    }
+}
+
 } // extern "C"
@@ -15,6 +15,10 @@
 
 #include <aie_api/aie.hpp>
 
+#ifndef VEC_SIZE
+#define VEC_SIZE 64 // Fallback used when the build does not define VEC_SIZE itself
+#endif // VEC_SIZE
+
 void matvec_scalar(uint32_t m,
                    uint32_t k,
                    const bfloat16 *__restrict a,
@@ -40,22 +44,17 @@ Matrix-vector multiplication kernel
  - c: Pointer to the output vector
  - r: Vector size; data from the matrix and vector will be loaded in and processed in chunks of this size
 */
-template <uint32_t r>
-void matvec_vectorized(uint32_t m,
-                       uint32_t k,
-                       const bfloat16 *__restrict a,
-                       const bfloat16 *__restrict b,
-                       bfloat16 *__restrict c)
+template <uint32_t r, uint32_t k>
+void matvec_vectorized(uint32_t m, const bfloat16 *__restrict a, const bfloat16 *__restrict b, bfloat16 *__restrict c)
 {
     ::aie::set_rounding(aie::rounding_mode::conv_even);
     bfloat16 *c_end = c + m;
     const bfloat16 *b_end = b + k;
     for (; c < c_end; c++) {
         aie::accum acc = aie::zeros<accfloat, r>();
-        // The following two pragmas enable pipelining the zero-overhead loop, but they do assume that k is at least
-        // two. This assumption should hold for any useful use of this function; if k were one, this would be a simple
-        // scalar multiplication of a vector.
-        AIE_LOOP_MIN_ITERATION_COUNT(2)
+        // The following pragma enables pipelining the zero-overhead loop, but it promises the compiler at least
+        // k / VEC_SIZE iterations, which is guaranteed only if r <= VEC_SIZE. The code will break if that is not the case!
+        AIE_LOOP_MIN_ITERATION_COUNT(k / VEC_SIZE)
         for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
             aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
             aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
@@ -72,25 +71,23 @@ extern "C" {
  * `c`.  */
 
 void matvec_scalar_bf16_bf16(uint32_t m,
-                             uint32_t k,
                              uint32_t row_offset,
                              const bfloat16 *__restrict a_in,
                              const bfloat16 *__restrict b_in,
                              bfloat16 *__restrict c_out)
 {
     c_out += row_offset;
-    matvec_scalar(m, k, a_in, b_in, c_out);
+    matvec_scalar(m, DIM_K, a_in, b_in, c_out); // k = DIM_K; not defined in view, presumably a -D build flag (TODO confirm)
 }
 
 void matvec_vectorized_bf16_bf16(uint32_t m,
-                                 uint32_t k,
                                  uint32_t row_offset,
                                  const bfloat16 *__restrict a_in,
                                  const bfloat16 *__restrict b_in,
                                  bfloat16 *__restrict c_out)
 {
     c_out += row_offset;
-    matvec_vectorized<64>(m, k, a_in, b_in, c_out);
+    matvec_vectorized<VEC_SIZE, DIM_K>(m, a_in, b_in, c_out); // template args: r = VEC_SIZE, k = DIM_K (compile-time)
 }
 
 } // extern "C"
@@ -16,7 +16,9 @@
 @pytest.fixture
 def aie_context():
     """Create a fresh AIEContext for each test"""
-    return AIEContext()
+    ctx = AIEContext()
+    yield ctx  # the test body runs while the fixture is suspended here
+    ctx.device_manager.reset()  # teardown step: runs once the test using this fixture has finished
 
 
 def pytest_addoption(parser):
 
@@ -3,8 +3,16 @@
 
 """Common utilities and base classes for IRON operators."""
 
-from .aie_base import AIEOperatorBase, AIEOperatorConstraintError
-from .aie_context import AIEContext
+from .base import (
+    AIEOperatorBase,
+    MLIROperator,
+    CompositeOperator,
+    CompositeCallable,
+    AIEBuffer,
+    SingleXclbinCallable,
+    AIERuntimeArgSpec,
+)
+from .context import AIEContext
 from .compilation import (
     XclbinArtifact,
     InstsBinArtifact,
@@ -13,4 +21,4 @@
     SourceArtifact,
     PythonGeneratedMLIRArtifact,
 )
-from .aie_device_manager import AIEDeviceManager
+from .device_manager import AIEDeviceManager