From 4a3f6b9104fe776f35a3833a8c73977311f2f3f9 Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Thu, 13 Jun 2024 10:04:18 +0530
Subject: [PATCH 1/7] Math: Add Doxygen documentation for matrix multiplication

This patch introduces Doxygen-style documentation to the matrix
multiplication functions. Clear descriptions and parameter details
are provided to facilitate better understanding and ease of use.

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index e4c4c1ecbfff..c2c201f41692 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -3,11 +3,28 @@
 // Copyright(c) 2022 Intel Corporation. All rights reserved.
 //
 // Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+//	   Shriram Shastry <malladi.sastry@linux.intel.com>
+//
 
 #include <sof/math/matrix.h>
 #include <errno.h>
 #include <stdint.h>
 
+/**
+ * Description: Performs matrix multiplication of two fixed-point 16-bit integer matrices,
+ *	 storing the result in a third matrix. It accounts for fractional bits for
+ *	 fixed-point arithmetic, adjusting the result accordingly.
+ *
+ * Arguments:
+ *   a: pointer to the first input matrix
+ *   b: pointer to the second input matrix
+ *   c: pointer to the output matrix to store result
+ *
+ * Return:
+ *   0 on successful multiplication.
+ *   -EINVAL if input dimensions do not allow for multiplication.
+ *   -ERANGE if the shift operation might cause integer overflow.
+ */
 int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_matrix_16b *c)
 {
 	int64_t s;
@@ -58,6 +75,22 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 	return 0;
 }
 
+/**
+ * Description: Performs element-wise multiplication of two matrices with 16-bit integer elements
+ *		and stores the result in a third matrix. Checks that all matrices have the same
+ *		dimensions and adjusts for fractional bits appropriately. This operation handles
+ *		the manipulation of fixed-point precision based on the fractional bits present in
+ *		the matrices.
+ *
+ * Arguments:
+ *   a - pointer to the first input matrix
+ *   b - pointer to the second input matrix
+ *   c - pointer to the output matrix where the result will be stored
+ *
+ * Returns:
+ *   0 on successful multiplication,
+ *  -EINVAL if input pointers are NULL or matrix dimensions do not match.
+ */
 int mat_multiply_elementwise(struct mat_matrix_16b *a, struct mat_matrix_16b *b,
 			     struct mat_matrix_16b *c)
 {	int64_t p;

From 1ac1b5d1c811b91452c4c9a0aef868d634ecfe0a Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Thu, 13 Jun 2024 10:11:56 +0530
Subject: [PATCH 2/7] Math: Error Checking Enhancements

- Added checks for integer overflow during shifting.
- Validated matrix dimensions to prevent mismatches.
- Ensured non-null pointers before operating on matrices.

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index c2c201f41692..7025a365bb1f 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -27,6 +27,10 @@
  */
 int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_matrix_16b *c)
 {
+	/* Validate matrix dimensions are compatible for multiplication */
+	if (a->columns != b->rows || a->rows != c->rows || b->columns != c->columns)
+		return -EINVAL;
+
 	int64_t s;
 	int16_t *x;
 	int16_t *y;
@@ -35,8 +39,9 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 	int y_inc = b->columns;
 	const int shift_minus_one = a->fractions + b->fractions - c->fractions - 1;
 
-	if (a->columns != b->rows || a->rows != c->rows || b->columns != c->columns)
-		return -EINVAL;
+	/* Check shift to ensure no integer overflow occurs during shifting */
+	if (shift_minus_one < -1 || shift_minus_one > 31)
+		return -ERANGE;
 
 	/* If all data is Q0 */
 	if (shift_minus_one == -1) {
@@ -93,18 +98,21 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
  */
 int mat_multiply_elementwise(struct mat_matrix_16b *a, struct mat_matrix_16b *b,
 			     struct mat_matrix_16b *c)
-{	int64_t p;
+{
+	/* Validate matrix dimensions and non-null pointers */
+	if (!a || !b || !c ||
+	    a->columns != b->columns || a->rows != b->rows ||
+	    c->columns != a->columns || c->rows != a->rows) {
+		return -EINVAL;
+	}
+
 	int16_t *x = a->data;
 	int16_t *y = b->data;
 	int16_t *z = c->data;
+	int64_t p;
 	int i;
 	const int shift_minus_one = a->fractions + b->fractions - c->fractions - 1;
 
-	if (a->columns != b->columns || b->columns != c->columns ||
-	    a->rows != b->rows || b->rows != c->rows) {
-		return -EINVAL;
-	}
-
 	/* If all data is Q0 */
 	if (shift_minus_one == -1) {
 		for (i = 0; i < a->rows * a->columns; i++) {

From 1f4f10af199e3ebb426bdb616c024d82b472ca49 Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Mon, 26 Aug 2024 17:10:01 +0530
Subject: [PATCH 3/7] Math: Change accumulator data type to int32_t for matrix
 multiplication

Changed the accumulator data type from `int64_t` to `int32_t` to reduce
instruction cycle count. This change results in an approximate 8.18% gain
in performance for matrix multiplication operations.

Performance Results:
Compiler Settings: -O2
+------------+------+------+--------+-----------+-----------+----------+
| Test Name  | Rows | Cols | Cycles | Max Error | RMS Error | Result   |
+------------+------+------+--------+-----------+-----------+----------+
| Test 1     | 3    | 5    | 6487   | 0.00      | 0.00      | Pass     |
| Test 2     | 6    | 8    | 6106   | 0.00      | 0.00      | Pass     |
+------------+------+------+--------+-----------+-----------+----------+

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index 7025a365bb1f..509b7c168cec 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -31,7 +31,7 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 	if (a->columns != b->rows || a->rows != c->rows || b->columns != c->columns)
 		return -EINVAL;
 
-	int64_t s;
+	int32_t s;  /* Changed from int64_t to int32_t */
 	int16_t *x;
 	int16_t *y;
 	int16_t *z = c->data;

From 5ddb7c02cbc4d3abc458ec88d4a09ece479215d7 Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Mon, 26 Aug 2024 17:47:12 +0530
Subject: [PATCH 4/7] Math: Enhance pointer arithmetic in matrix multiplication

Enhanced pointer arithmetic within loops to improve readability and
reduce overhead. This change potentially reduces minor computational
overhead, contributing to overall performance improvements of around
8.23% for Test 1 and 16.00% for Test 2.

Performance Results:
Compiler Settings: -O3

+------------+------+------+--------+-----------+-----------+----------+
| Test Name  | Rows | Cols | Cycles | Max Error | RMS Error | Result   |
+------------+------+------+--------+-----------+-----------+----------+
| Test 1     | 3    | 5    | 5953   | 0.00      | 0.00      | Pass     |
| Test 2     | 6    | 8    | 5128   | 0.00      | 0.00      | Pass     |
+------------+------+------+--------+-----------+-----------+----------+

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index 509b7c168cec..093d10d7afaf 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -51,12 +51,12 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 				x = a->data + a->columns * i;
 				y = b->data + j;
 				for (k = 0; k < b->rows; k++) {
-					s += (int32_t)(*x) * (*y);
-					x++;
+					/* Enhanced pointer arithmetic */
+					s += (int32_t)(*x++) * (*y);
 					y += y_inc;
 				}
-				*z = (int16_t)s; /* For Q16.0 */
-				z++;
+				/* Enhanced pointer arithmetic */
+				*z++ = (int16_t)s;
 			}
 		}
 
@@ -69,12 +69,12 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 			x = a->data + a->columns * i;
 			y = b->data + j;
 			for (k = 0; k < b->rows; k++) {
-				s += (int32_t)(*x) * (*y);
-				x++;
+				/* Enhanced pointer arithmetic */
+				s += (int32_t)(*x++) * (*y);
 				y += y_inc;
 			}
-			*z = (int16_t)(((s >> shift_minus_one) + 1) >> 1); /*Shift to Qx.y */
-			z++;
+			/* Enhanced pointer arithmetic */
+			*z++ = (int16_t)(((s >> shift_minus_one) + 1) >> 1); /*Shift to Qx.y */
 		}
 	}
 	return 0;

From 53dbdefe460ca804ab1f374f028b832852649f67 Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Mon, 26 Aug 2024 18:19:27 +0530
Subject: [PATCH 5/7] Math: Update comments and apply cosmetic changes

Updated comments for better clarity and understanding. Made cosmetic
changes such as reformatting code and renaming variables to enhance
readability without impacting functionality. This resulted in
approximately 7.97% and 15.00% performance improvements for
Test 1 and Test 2, respectively.

Performance Results:
Compiler Settings: -O2

+------------+------+------+--------+-----------+-----------+----------+
| Test Name  | Rows | Cols | Cycles | Max Error | RMS Error | Result   |
+------------+------+------+--------+-----------+-----------+----------+
| Test 1     | 3    | 5    | 5975   | 0.00      | 0.00      | Pass     |
| Test 2     | 6    | 8    | 5192   | 0.00      | 0.00      | Pass     |
+------------+------+------+--------+-----------+-----------+----------+

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 67 +++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index 093d10d7afaf..aef253e683ec 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -31,50 +31,61 @@ int mat_multiply(struct mat_matrix_16b *a, struct mat_matrix_16b *b, struct mat_
 	if (a->columns != b->rows || a->rows != c->rows || b->columns != c->columns)
 		return -EINVAL;
 
-	int32_t s;  /* Changed from int64_t to int32_t */
-	int16_t *x;
-	int16_t *y;
-	int16_t *z = c->data;
-	int i, j, k;
-	int y_inc = b->columns;
-	const int shift_minus_one = a->fractions + b->fractions - c->fractions - 1;
+	int32_t acc;	/* Accumulator for dot product calculation */
+	int16_t *x, *y, *z = c->data; /* Pointers for matrices a, b, and c */
+	int i, j, k;	/* Loop counters */
+	int y_inc = b->columns;	   /* Column increment for matrix b elements */
+	/* Calculate shift amount for adjusting fractional bits in the result */
+	const int shift = a->fractions + b->fractions - c->fractions - 1;
 
 	/* Check shift to ensure no integer overflow occurs during shifting */
-	if (shift_minus_one < -1 || shift_minus_one > 31)
+	if (shift < -1 || shift > 31)
 		return -ERANGE;
 
-	/* If all data is Q0 */
-	if (shift_minus_one == -1) {
+	/* Special case when shift is -1 (Q0 data) */
+	if (shift == -1) {
+		/* Matrix multiplication loop */
 		for (i = 0; i < a->rows; i++) {
 			for (j = 0; j < b->columns; j++) {
-				s = 0;
+				/* Initialize accumulator for each element */
+				acc = 0;
+				/* Set x at the start of ith row of a */
 				x = a->data + a->columns * i;
+				/* Set y at the top of jth column of b */
 				y = b->data + j;
+				/* Dot product loop */
 				for (k = 0; k < b->rows; k++) {
-					/* Enhanced pointer arithmetic */
-					s += (int32_t)(*x++) * (*y);
+					/* Multiply & accumulate */
+					acc += (int32_t)(*x++) * (*y);
+					 /* Move to next row in the current column of b */
 					y += y_inc;
 				}
 				/* Enhanced pointer arithmetic */
-				*z++ = (int16_t)s;
+				*z = (int16_t)acc;
+				z++; /* Move to the next element in the output matrix */
 			}
 		}
-
-		return 0;
-	}
-
-	for (i = 0; i < a->rows; i++) {
-		for (j = 0; j < b->columns; j++) {
-			s = 0;
-			x = a->data + a->columns * i;
-			y = b->data + j;
-			for (k = 0; k < b->rows; k++) {
+	} else {
+		/* General case for other shift values */
+		for (i = 0; i < a->rows; i++) {
+			for (j = 0; j < b->columns; j++) {
+				/* Initialize accumulator for each element */
+				acc = 0;
+				/* Set x at the start of ith row of a */
+				x = a->data + a->columns * i;
+				/* Set y at the top of jth column of b */
+				y = b->data + j;
+				/* Dot product loop */
+				for (k = 0; k < b->rows; k++) {
+					/* Multiply & accumulate Enhanced pointer arithmetic */
+					acc += (int32_t)(*x++) * (*y);
+					/* Move to next row in the current column of b */
+					y += y_inc;
+				}
 				/* Enhanced pointer arithmetic */
-				s += (int32_t)(*x++) * (*y);
-				y += y_inc;
+				*z = (int16_t)(((acc >> shift) + 1) >> 1); /*Shift to Qx.y */
+				z++; /* Move to the next element in the output matrix */
 			}
-			/* Enhanced pointer arithmetic */
-			*z++ = (int16_t)(((s >> shift_minus_one) + 1) >> 1); /*Shift to Qx.y */
 		}
 	}
 	return 0;

From 65c21f0df6c72026156aef566ee85e8babde7b1a Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Tue, 27 Aug 2024 19:29:16 +0530
Subject: [PATCH 6/7] Math: Improve pointer manipulation in
 mat_multiply_elementwise

- Enhanced data pointers for matrix elements
- Streamlined loop iteration for matrix element-wise
 multiplication
- Achieved a 0.09% performance improvement in cycle count

| Rows | Cols | Cycles | Max Error | RMS Error | Result|
+------+------+--------+-----------+-----------+-------+
| 5    | 6    | 3359   | 0.00      | 0.00      | Pass  |

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index aef253e683ec..0ca49f983abe 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -126,22 +126,15 @@ int mat_multiply_elementwise(struct mat_matrix_16b *a, struct mat_matrix_16b *b,
 
 	/* If all data is Q0 */
 	if (shift_minus_one == -1) {
-		for (i = 0; i < a->rows * a->columns; i++) {
+		for (i = 0; i < a->rows * a->columns; i++, x++, y++, z++)
 			*z = *x * *y;
-			x++;
-			y++;
-			z++;
-		}
 
 		return 0;
 	}
 
-	for (i = 0; i < a->rows * a->columns; i++) {
+	for (i = 0; i < a->rows * a->columns; i++, x++, y++, z++) {
 		p = (int32_t)(*x) * *y;
-		*z = (int16_t)(((p >> shift_minus_one) + 1) >> 1); /*Shift to Qx.y */
-		x++;
-		y++;
-		z++;
+		*z = (int16_t)(((p >> shift_minus_one) + 1) >> 1); /* Shift to Qx.y */
 	}
 
 	return 0;

From c3eeab12e5526c1bf8e0311a3a4a1d7f81b83cad Mon Sep 17 00:00:00 2001
From: Shriram Shastry <malladi.sastry@intel.com>
Date: Tue, 27 Aug 2024 20:25:32 +0530
Subject: [PATCH 7/7] Math: Switch mat_multiply_elementwise product type to
 int32_t

- Changed product variable from int64_t to int32_t
- Improved performance by reducing data size
- Achieved a 11.57% performance improvement in cycle count

| Rows | Cols | Cycles | Max Error | RMS Error | Result |
+------+------+--------+-----------+-----------+--------+
| 5    | 6    | 2972   | 0.00	   | 0.00      | Pass   |

Signed-off-by: Shriram Shastry <malladi.sastry@intel.com>
---
 src/math/matrix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/math/matrix.c b/src/math/matrix.c
index 0ca49f983abe..100e6da1bc02 100644
--- a/src/math/matrix.c
+++ b/src/math/matrix.c
@@ -120,7 +120,7 @@ int mat_multiply_elementwise(struct mat_matrix_16b *a, struct mat_matrix_16b *b,
 	int16_t *x = a->data;
 	int16_t *y = b->data;
 	int16_t *z = c->data;
-	int64_t p;
+	int32_t p;
 	int i;
 	const int shift_minus_one = a->fractions + b->fractions - c->fractions - 1;