From f44f61b31f4d52cb2ddd4c395665424fc238673e Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Wed, 5 Feb 2025 16:54:33 +0200
Subject: [PATCH 1/4] Audio: SRC: Remove unnecessary initialization from
 fir_filter()

The intrinsic "functions" assign these variables on their first use,
even though it does not look so when reading the code as normal C
function calls. E.g. this code line loads data from address coefp into
coef2 and advances coefp.

	AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/src/src_hifi4.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/audio/src/src_hifi4.c b/src/audio/src/src_hifi4.c
index 1106968f4085..18130cd7f63c 100644
--- a/src/audio/src/src_hifi4.c
+++ b/src/audio/src/src_hifi4.c
@@ -180,10 +180,10 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 	 */
 	ae_f64 a0;
 	ae_f64 a1;
-	ae_f24x2 data2 = AE_ZERO24();
-	ae_f24x2 coef2 = AE_ZERO24();
-	ae_f24x2 d0 = AE_ZERO24();
-	ae_f24x2 d1 = AE_ZERO24();
+	ae_f24x2 data2;
+	ae_f24x2 coef2;
+	ae_f24x2 d0;
+	ae_f24x2 d1;
 	ae_f24x2 *coefp;
 	ae_f24x2 *dp;
 	ae_f24 *dp1;

From 92a4bf68394efe7bea8d444d57e52723f85d7657 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Wed, 5 Feb 2025 17:14:58 +0200
Subject: [PATCH 2/4] Audio: SRC: Add HiFi5 version, as copy of HiFi4 version

Copying the file as such from HiFi4 and adding it to build
as preparation for optimizations.

The src_config.h is updated to follow the kconfig of FILTER
(CONFIG_FILTER_HIFI_NONE, CONFIG_FILTER_HIFI_3, etc.). The
gcc build is no longer forced to use the 16 bit SRC mode. That was a
limitation of the HiFi2 platform BYT. The new platforms are
powerful enough to run the 32 bit coefficients version.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/src/CMakeLists.txt |   2 +-
 src/audio/src/src_config.h   |  61 +---
 src/audio/src/src_hifi5.c    | 557 +++++++++++++++++++++++++++++++++++
 zephyr/CMakeLists.txt        |   1 +
 4 files changed, 574 insertions(+), 47 deletions(-)
 create mode 100644 src/audio/src/src_hifi5.c

diff --git a/src/audio/src/CMakeLists.txt b/src/audio/src/CMakeLists.txt
index bfc6361afedb..111b2484b4da 100644
--- a/src/audio/src/CMakeLists.txt
+++ b/src/audio/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
-add_local_sources(sof src_generic.c src_hifi2ep.c src_hifi3.c src_hifi4.c src_common.c src.c)
+add_local_sources(sof src_generic.c src_hifi2ep.c src_hifi3.c src_hifi4.c src_hifi5.c src_common.c src.c)
 
 if(CONFIG_IPC_MAJOR_3)
 	add_local_sources(sof src_ipc3.c)
diff --git a/src/audio/src/src_config.h b/src/audio/src/src_config.h
index 01a60230093c..9e53704b123f 100644
--- a/src/audio/src/src_config.h
+++ b/src/audio/src/src_config.h
@@ -8,55 +8,24 @@
 #ifndef __SOF_AUDIO_SRC_SRC_CONFIG_H__
 #define __SOF_AUDIO_SRC_SRC_CONFIG_H__
 
-
-
-/* If next define is set to 1 the SRC is configured automatically. Setting
- * to zero temporarily is useful is for testing needs.
- */
-#define SRC_AUTOARCH    1
-
-/* Force manually some code variant when SRC_AUTODSP is set to zero. These
- * are useful in code debugging.
- */
-#if SRC_AUTOARCH == 0
-#define SRC_GENERIC	1
-#define SRC_HIFIEP	0
-#define SRC_HIFI3	0
-#endif
-
-/* Select optimized code variant when xt-xcc compiler is used */
-#if SRC_AUTOARCH == 1
-#if defined __XCC__
-#include <xtensa/config/core-isa.h>
-#define SRC_GENERIC	0
-#if XCHAL_HAVE_HIFI4 == 1
-#define SRC_HIFI4	1
-#define SRC_HIFI3	0
-#define SRC_HIFIEP	0
+#include <sof/common.h>
+
+/* Follow kconfig for FILTER in SRC component */
+#if SOF_USE_MIN_HIFI(5, FILTER)
+#define SRC_HIFI5      1
+#elif SOF_USE_MIN_HIFI(4, FILTER)
+#define SRC_HIFI4      1
+#elif SOF_USE_MIN_HIFI(3, FILTER)
+#define SRC_HIFI3      1
+#elif SOF_USE_HIFI(2, FILTER)
+#define SRC_HIFIEP     1
 #else
-#define SRC_HIFI4	0
-#if XCHAL_HAVE_HIFI2EP == 1
-#define SRC_HIFIEP	1
-#define SRC_HIFI3	0
-#endif
-#if XCHAL_HAVE_HIFI3 == 1
-#define SRC_HIFI3	1
-#define SRC_HIFIEP	0
-#endif
-#endif
-#else
-/* GCC */
-#define SRC_GENERIC	1
-#define SRC_HIFIEP	0
-#define SRC_HIFI3	0
-#if CONFIG_LIBRARY
-#else
-#define SRC_SHORT	1 /* Need to use for generic code version speed */
-#endif
-#endif
+#define SRC_GENERIC    1
 #endif
 
-/* Kconfig option tiny needs 16 bits coefficients, other options use 32 bits */
+/* Kconfig option tiny needs 16 bits coefficients, other options use 32 bits,
+ * also gcc builds for all platforms and testbench (library)
+ */
 #if !defined(SRC_SHORT)
 #if CONFIG_COMP_SRC_TINY
 #define SRC_SHORT	1 /* 16 bit coefficients filter core */
diff --git a/src/audio/src/src_hifi5.c b/src/audio/src/src_hifi5.c
new file mode 100644
index 000000000000..e53e1ceab9d6
--- /dev/null
+++ b/src/audio/src/src_hifi5.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2022-2025 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+//         Krzysztof Frydryk <krzysztofx.frydryk@intel.com>
+
+/* HiFi5 optimized code parts for SRC */
+
+#include "src_config.h"
+
+#if SRC_HIFI5
+
+#include "src_common.h"
+
+#include <sof/math/numbers.h>
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi5.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* sof/math/numbers.h doesn't define MIN when used with zephyr */
+#ifdef __ZEPHYR__
+#include <zephyr/sys/util.h>
+#endif /* __ZEPHYR__ */
+
+/* HiFi4 has
+ * 16x 64 bit registers in register file AE_DR
+ */
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	/* This function uses
+	 * 6x 64 bit registers
+	 * 3x integers
+	 * 5x address pointers,
+	 */
+	ae_f64 a0;
+	ae_f64 a1;
+	ae_valign u;
+	ae_f16x4 coef4;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	ae_f32x2 data2;
+	ae_f16x4 *coefp;
+	ae_f32x2 *dp;
+	ae_f32 *dp0;
+	ae_f32 *dp1;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		dp = (ae_f32x2 *)rp;
+		AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_f16x4 *)cp;
+		a0 = AE_ZERO64();
+		a1 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration.  Four coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load four coefficients */
+			AE_LA16X4_IP(coef4, u, coefp);
+
+			/* Load two data samples from two channels */
+			AE_L64_XC(d0, dp, inc); /* r0, l0 */
+			AE_L64_XC(d1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 sequential samples from a channel
+			 * and then accumulate to a0 and a1
+			 * data2_h * coef4_3 + data2_l * coef4_2.
+			 * The data is 32 bits Q1.31 and coefficient 16 bits
+			 * Q1.15. The accumulators are Q17.47.
+			 */
+			data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
+			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+			data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
+			AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
+
+			/* Load two data samples from two channels */
+			AE_L64_XC(d0, dp, inc); /* r2, l2 */
+			AE_L64_XC(d1, dp, inc); /* r3, l3 */
+
+			/* Accumulate
+			 * data2_h * coef4_1 + data2_l * coef4_0.
+			 */
+			data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
+			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+			data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
+			AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			    sizeof(int32_t));
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+			    sizeof(int32_t));
+
+		return;
+	}
+
+	dp1 = (ae_f32 *)rp;
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp1;
+		AE_L32_XC(d0, dp1, -sizeof(ae_f32));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_f16x4 *)cp;
+		a0 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Data is read from
+		 * interleaved buffer with stride of channels count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load four coefficients */
+			AE_LA16X4_IP(coef4, u, coefp);
+
+			/* Load two data samples, place to high and
+			 * low of data2.
+			 */
+			AE_L32_XC(d0, dp0, inc);
+			AE_L32_XC(d1, dp0, inc);
+			data2 = AE_SEL32_LL(d0, d1);
+
+			/* Accumulate
+			 * data2_h * coef4_3 + data2_l* coef4_2.
+			 * The data is 32 bits Q1.31 and coefficient 16 bits
+			 * Q1.15. The accumulator is Q17.47.
+			 */
+			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+
+			/* Repeat with next two samples */
+			AE_L32_XC(d0, dp0, inc);
+			AE_L32_XC(d1, dp0, inc);
+			data2 = AE_SEL32_LL(d0, d1);
+
+			/* Accumulate
+			 * data2_h * coef4_1 + data2_l * coef4_0.
+			 */
+			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate Q17.47
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			    sizeof(int32_t));
+	}
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	/* This function uses
+	 * 6x 64 bit registers
+	 * 3x integers
+	 * 5x address pointers,
+	 */
+	ae_f64 a0;
+	ae_f64 a1;
+	ae_f24x2 data2;
+	ae_f24x2 coef2;
+	ae_f24x2 d0;
+	ae_f24x2 d1;
+	ae_f24x2 *coefp;
+	ae_f24x2 *dp;
+	ae_f24 *dp1;
+	ae_f24 *dp0;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		dp = (ae_f24x2 *)rp;
+		AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_f24x2 *)cp;
+		a0 = AE_ZERO64();
+		a1 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration.  Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			/* TODO: Ensure coefficients are 64 bits aligned */
+			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+
+			/* Load two data samples from two channels */
+			AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
+			AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 two successive left channel samples
+			 * from d0 and d1, multiply-add and accumulate to a0.
+			 * Select to data2 two successive right channel samples
+			 * from d0 and d1, multiply-add and accumulate to a1.
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and Q1.15 coefficients are used as 24 bits as
+			 * Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(d0, d1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+			AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
+			AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(d0, d1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			    sizeof(int32_t));
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+			    sizeof(int32_t));
+
+		return;
+	}
+
+	dp1 = (ae_f24 *)rp;
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp1;
+		AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_f24x2 *)cp;
+		a0 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Data is read from
+		 * interleaved buffer with stride of channels count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples, place to high and
+			 * low of data2.
+			 */
+			AE_L32F24_XC(d0, dp0, inc);
+			AE_L32F24_XC(d1, dp0, inc);
+			data2 = AE_SELP24_LL(d0, d1);
+
+			/* Accumulate to data2_h * coef2_h +
+			 * data2_l*coef2_l. The Q1.31 bit data is used
+			 * as Q1.23 from MSB side bits of the 32 bit
+			 * word. The accumulator m is Q17.47.
+			 */
+			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+
+			/* Repeat the same for next two filter taps */
+			coef2 = *coefp++;
+			AE_L32F24_XC(d0, dp0, inc);
+			AE_L32F24_XC(d1, dp0, inc);
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate Q17.47
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			    sizeof(int32_t));
+	}
+}
+
+#endif /* 32bit coefficients version */
+
+#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	/* This function uses
+	 *  1x 64 bit registers
+	 * 16x integers
+	 * 11x address pointers,
+	 */
+	ae_int32x2 q = AE_ZERO32();
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	const struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+	int32_t *x_rptr = (int32_t *)s->x_rptr;
+	int32_t *y_wptr = (int32_t *)s->y_wptr;
+	int32_t *x_end_addr = (int32_t *)s->x_end_addr;
+	int32_t *y_end_addr = (int32_t *)s->y_end_addr;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data to filter */
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		for (m = blk_in_words; m > 0; m -= n_min) {
+			/* Number of words until circular wrap */
+			n_wrap_buf = x_end_addr - x_rptr;
+			n_min = MIN(m, n_wrap_buf);
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bits sample to accumulator,
+				 * advance pointer, shift left with saturate.
+				 */
+				AE_L32_XP(q, (ae_int32 *)x_rptr, sz);
+				q = AE_SLAA32(q, s->shift);
+
+				/* Store to circular buffer, advance pointer */
+				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for wrap */
+			src_inc_wrap(&x_rptr, x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Reset FIR write pointer and compute all polyphase
+		 * sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp = (uint8_t *)cp + subfilter_size;
+			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+
+			/* Circular advance pointer rp by number of
+			 * channels x input delay multiplier. Loaded value q
+			 * is discarded.
+			 */
+			AE_L32_XC(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		for (m = blk_out_words; m > 0; m -= n_min) {
+			n_wrap_buf = y_end_addr - y_wptr;
+			n_min = MIN(m, n_wrap_buf);
+			for (i = 0; i < n_min; i++) {
+				/* Circular load, shift right, linear store,
+				 * and advance read and write pointers.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+				q = AE_SRAA32(q, s->shift);
+				AE_S32_L_XP(q, (ae_int32 *)y_wptr, sz);
+			}
+
+			/* Check wrap */
+			src_inc_wrap(&y_wptr, y_end_addr, s->y_size);
+		}
+	}
+	s->x_rptr = x_rptr;
+	s->y_wptr = y_wptr;
+}
+#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */
+
+#if CONFIG_FORMAT_S16LE
+void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
+{
+	/* This function uses
+	 *  2x 64 bit registers
+	 * 16x integers
+	 * 11x address pointers,
+	 */
+	ae_int32x2 q = AE_ZERO32();
+	ae_int16x4 d = AE_ZERO16();
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+
+	struct src_state *fir = s->state;
+	const struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+	int16_t *x_rptr = (int16_t *)s->x_rptr;
+	int16_t *y_wptr = (int16_t *)s->y_wptr;
+	int16_t *x_end_addr = (int16_t *)s->x_end_addr;
+	int16_t *y_end_addr = (int16_t *)s->y_end_addr;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+		for (m = blk_in_words; m > 0; m -= n_min) {
+			/* Number of words without circular wrap */
+			n_wrap_buf = x_end_addr - x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			for (i = 0; i < n_min; i++) {
+				/* Load a 16 bits sample into d and left shift
+				 * by 16 into q, advance read and write
+				 * pointers.
+				 */
+				AE_L16_XP(d, (ae_int16 *)x_rptr,
+					  sizeof(ae_int16));
+				q = AE_CVT32X2F16_32(d);
+				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for wrap */
+			src_inc_wrap_s16(&x_rptr, x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Reset FIR output write pointer and compute all polyphase
+		 * sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp = (uint8_t *)cp + subfilter_size;
+			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+
+			/* Circular advance pointer rp by number of
+			 * channels x input delay multiplier. Loaded value q
+			 * is discarded.
+			 */
+			AE_L32_XC(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		for (m = blk_out_words; m > 0; m -= n_min) {
+			n_wrap_buf = y_end_addr - y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load for 32 bit sample,
+				 * advance read pointer.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+
+				/* Store Q1.31 value as Q1.15 and
+				 * advance write pointer.
+				 */
+				d = AE_ROUND16X4F32SSYM(q, q);
+				AE_S16_0_XP(d, (ae_int16 *)y_wptr,
+					    sizeof(ae_int16));
+			}
+
+			/* Check wrap */
+			src_inc_wrap_s16(&y_wptr, y_end_addr, s->y_size);
+		}
+	}
+	s->x_rptr = x_rptr;
+	s->y_wptr = y_wptr;
+}
+#endif /* CONFIG_FORMAT_S16LE */
+
+#endif
diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt
index 4c2ffbc3db4d..77050ac54c20 100644
--- a/zephyr/CMakeLists.txt
+++ b/zephyr/CMakeLists.txt
@@ -867,6 +867,7 @@ elseif(CONFIG_COMP_SRC)
 		${SOF_AUDIO_PATH}/src/src_generic.c
 		${SOF_AUDIO_PATH}/src/src_hifi3.c
 		${SOF_AUDIO_PATH}/src/src_hifi4.c
+		${SOF_AUDIO_PATH}/src/src_hifi5.c
 		${SOF_AUDIO_PATH}/src/src_common.c
 		${SOF_AUDIO_PATH}/src/src.c
 		${SOF_AUDIO_PATH}/src/src_${ipc_suffix}.c

From 2943ba2b32383115d69cf342481acb3f9745d855 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Wed, 5 Feb 2025 18:11:56 +0200
Subject: [PATCH 3/4] Audio: SRC: Optimize HiFi5 version code

This patch contains code optimizations in both generic C and with
Xtensa HiFi5 intrinsics

- As a fix, the alignment register used for coefficient reads is
  primed in the 16 bit coefficients version. Without priming, the
  load works correctly only for aligned addresses.
- The FIR filter functions are separated for stereo and any channels
  count versions. It avoids channels count check for every
  intermediate and final sample value.
- The 32 bit coefficients version is similarly separated for stereo
  and any channels version.
- In 32 bits version the coefficients load is changed to 128 bits
  load with aligning. Data load can't be similarly enhanced due
  to frame granularity shift in data in polyphase filters matrix
  compute.
- The dual-MAC for FIR calculate is changed to quad-MAC
- The data store is changed in stereo version to 64 bit store of
  stereo frame.
- In src_polyphase_stage_cir() function the filters process loops
  are separated for stereo and for any channels count.

In a HiFi5 build of sof-testbench4, the load for stereo 32 bit
44.1 kHz to 48 kHz conversion improves from 18.37 to
16.26 MCPS.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/src/src_hifi5.c | 420 ++++++++++++++++++++------------------
 1 file changed, 223 insertions(+), 197 deletions(-)

diff --git a/src/audio/src/src_hifi5.c b/src/audio/src/src_hifi5.c
index e53e1ceab9d6..0adfc4796d25 100644
--- a/src/audio/src/src_hifi5.c
+++ b/src/audio/src/src_hifi5.c
@@ -24,20 +24,21 @@
 #include <zephyr/sys/util.h>
 #endif /* __ZEPHYR__ */
 
-/* HiFi4 has
- * 16x 64 bit registers in register file AE_DR
- */
+#if SRC_SHORT
+#define SRC_COEF_SIZE	sizeof(int16_t)
+#else
+#define SRC_COEF_SIZE	sizeof(int32_t)
+#endif
 
 #if SRC_SHORT /* 16 bit coefficients version */
 
-static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
-			      const int taps_div_4, const int shift,
-			      const int nch)
+static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+				  const int taps_div_4, const int shift)
 {
 	/* This function uses
-	 * 6x 64 bit registers
-	 * 3x integers
-	 * 5x address pointers,
+	 * 7x 64 bit registers
+	 * 2x integers
+	 * 3x address pointers,
 	 */
 	ae_f64 a0;
 	ae_f64 a1;
@@ -48,73 +49,89 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 	ae_f32x2 data2;
 	ae_f16x4 *coefp;
 	ae_f32x2 *dp;
-	ae_f32 *dp0;
-	ae_f32 *dp1;
 	int i;
-	int j;
 	ae_f32 *wp = wp0;
-	const int inc = nch * sizeof(int32_t);
+	const int inc = 2 * sizeof(int32_t);
 
-	if (nch == 2) {
-		/* Move data pointer back by one sample to start from right
-		 * channel sample. Discard read value p0.
+	/* Move data pointer back by one sample to start from right
+	 * channel sample. Discard read value p0.
+	 */
+	dp = (ae_f32x2 *)rp;
+	AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+	/* Reset coefficient pointer and clear accumulator */
+	coefp = (ae_f16x4 *)cp;
+	a0 = AE_ZERO64();
+	a1 = AE_ZERO64();
+	u = AE_LA64_PP(coefp);
+
+	/* Compute FIR filter for current channel with four
+	 * taps per every loop iteration.  Four coefficients
+	 * are loaded simultaneously. Data is read
+	 * from interleaved buffer with stride of channels
+	 * count.
+	 */
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients */
+		AE_LA16X4_IP(coef4, u, coefp);
+
+		/* Load two data samples from two channels */
+		AE_L32X2_XC(d0, dp, inc); /* r0, l0 */
+		AE_L32X2_XC(d1, dp, inc); /* r1, l1 */
+
+		/* Select to data2 sequential samples from a channel
+		 * and then accumulate to a0 and a1
+		 * data2_h * coef4_3 + data2_l * coef4_2.
+		 * The data is 32 bits Q1.31 and coefficient 16 bits
+		 * Q1.15. The accumulators are Q17.47.
 		 */
-		dp = (ae_f32x2 *)rp;
-		AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+		data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
+		AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+		data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
+		AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
 
-		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f16x4 *)cp;
-		a0 = AE_ZERO64();
-		a1 = AE_ZERO64();
+		/* Load two data samples from two channels */
+		AE_L32X2_XC(d0, dp, inc); /* r2, l2 */
+		AE_L32X2_XC(d1, dp, inc); /* r3, l3 */
 
-		/* Compute FIR filter for current channel with four
-		 * taps per every loop iteration.  Four coefficients
-		 * are loaded simultaneously. Data is read
-		 * from interleaved buffer with stride of channels
-		 * count.
+		/* Accumulate
+		 * data2_h * coef4_1 + data2_l * coef4_0.
 		 */
-		for (i = 0; i < taps_div_4; i++) {
-			/* Load four coefficients */
-			AE_LA16X4_IP(coef4, u, coefp);
-
-			/* Load two data samples from two channels */
-			AE_L64_XC(d0, dp, inc); /* r0, l0 */
-			AE_L64_XC(d1, dp, inc); /* r1, l1 */
-
-			/* Select to data2 sequential samples from a channel
-			 * and then accumulate to a0 and a1
-			 * data2_h * coef4_3 + data2_l * coef4_2.
-			 * The data is 32 bits Q1.31 and coefficient 16 bits
-			 * Q1.15. The accumulators are Q17.47.
-			 */
-			data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
-			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
-			data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
-			AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
-
-			/* Load two data samples from two channels */
-			AE_L64_XC(d0, dp, inc); /* r2, l2 */
-			AE_L64_XC(d1, dp, inc); /* r3, l3 */
-
-			/* Accumulate
-			 * data2_h * coef4_1 + data2_l * coef4_0.
-			 */
-			data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
-			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
-			data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
-			AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
-		}
+		data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
+		AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+		data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
+		AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
+	}
 
-		/* Scale FIR output with right shifts, round/saturate
-		 * to Q1.31, and store 32 bit output.
-		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
-			    sizeof(int32_t));
+	/* Scale FIR output with right shifts, round/saturate
+	 * to Q1.31, and store 32 bit output.
+	 */
+	AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t));
+	AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t));
+}
 
-		return;
-	}
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	/* This function uses
+	 * 6x 64 bit registers
+	 * 3x integers
+	 * 3x address pointers,
+	 */
+	ae_f64 a0;
+	ae_valign u;
+	ae_f16x4 coef4;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	ae_f32x2 data2;
+	ae_f16x4 *coefp;
+	ae_f32 *dp0;
+	ae_f32 *dp1;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
 
 	dp1 = (ae_f32 *)rp;
 	for (j = 0; j < nch; j++) {
@@ -125,6 +142,7 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 		/* Reset coefficient pointer and clear accumulator */
 		coefp = (ae_f16x4 *)cp;
 		a0 = AE_ZERO64();
+		u = AE_LA64_PP(coefp);
 
 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration. Data is read from
@@ -170,139 +188,141 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 
 #else /* 32bit coefficients version */
 
-static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
-			      const int taps_div_4, const int shift,
-			      const int nch)
+static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+				  const int taps_div_4, const int shift)
 {
-	/* This function uses
-	 * 6x 64 bit registers
-	 * 3x integers
-	 * 5x address pointers,
-	 */
-	ae_f64 a0;
-	ae_f64 a1;
-	ae_f24x2 data2;
-	ae_f24x2 coef2;
-	ae_f24x2 d0;
-	ae_f24x2 d1;
-	ae_f24x2 *coefp;
-	ae_f24x2 *dp;
-	ae_f24 *dp1;
-	ae_f24 *dp0;
-	int i;
-	int j;
+	ae_valignx2 coef_align;
+	ae_f64 a0 = AE_ZERO64();
+	ae_f64 a1 = AE_ZERO64();
+	ae_f32x2 coef32;
+	ae_f32x2 coef10;
+	ae_f32x2 left10;
+	ae_f32x2 right10;
+	ae_f32x2 f0;
+	ae_f32x2 f1;
+	ae_int32x4 *coefp;
+	ae_f32x2 *dp;
 	ae_f32 *wp = wp0;
-	const int inc = nch * sizeof(int32_t);
-
-	if (nch == 2) {
-		/* Move data pointer back by one sample to start from right
-		 * channel sample. Discard read value p0.
-		 */
-		dp = (ae_f24x2 *)rp;
-		AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
+	const int inc = 2 * sizeof(int32_t);
+	int i;
 
-		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f24x2 *)cp;
-		a0 = AE_ZERO64();
-		a1 = AE_ZERO64();
+	/* Move data pointer back by one sample to start from right
+	 * channel sample. Discard read value p0.
+	 */
+	dp = (ae_f32x2 *)rp;
+	AE_L32_XC(f0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+	/* Reset coefficient pointer and prime the align register */
+	coefp = (ae_int32x4 *)cp;
+	coef_align = AE_LA128_PP(coefp);
+
+	/* Compute FIR filter for current channel with four
+	 * taps per every loop iteration.  Two coefficients
+	 * are loaded simultaneously. Data is read
+	 * from interleaved buffer with stride of channels
+	 * count.
+	 */
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients */
+		AE_LA32X2X2_IP(coef10, coef32, coef_align, coefp);
 
-		/* Compute FIR filter for current channel with four
-		 * taps per every loop iteration.  Two coefficients
-		 * are loaded simultaneously. Data is read
-		 * from interleaved buffer with stride of channels
-		 * count.
+		/* Load two data samples from two channels. Note: A 128 bit
+		 * load can't be used because the polyphase sub-filters shift
+		 * the data start, which breaks the alignment requirement.
 		 */
-		for (i = 0; i < taps_div_4; i++) {
-			/* Load two coefficients. Coef2_h contains tap *coefp
-			 * and coef2_l contains the next tap.
-			 */
-			/* TODO: Ensure coefficients are 64 bits aligned */
-			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
-
-			/* Load two data samples from two channels */
-			AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
-			AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
-
-			/* Select to data2 two successive left channel samples
-			 * from d0 and d1, multiply-add and accumulate to a0.
-			 * Select to data2 two successive right channel samples
-			 * from d0 and d1, multiply-add and accumulate to a1.
-			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
-			 * data and Q1.15 coefficients are used as 24 bits as
-			 * Q1.23 values.
-			 */
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
-			data2 = AE_SELP24_HH(d0, d1);
-			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
-
-			/* Repeat for next two taps */
-			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
-			AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
-			AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
-			data2 = AE_SELP24_HH(d0, d1);
-			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
-		}
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left0, f0.l is right0 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left1, f1.l is right1 */
 
-		/* Scale FIR output with right shifts, round/saturate
-		 * to Q1.31, and store 32 bit output.
+		/* a0 (left)  += left10.h  (left1)  * coef10.h (coef2)
+		 *            += left10.l  (left0)  * coef10.l (coef1)
+		 * a1 (right) += right10.h (right1) * coef10.h (coef2)
+		 *            += right10.l (right0) * coef10.l (coef1)
 		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
-			    sizeof(int32_t));
-
-		return;
+		left10 = AE_SEL32_HH(f0, f1);
+		right10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAF2D32RA_HH_LL(a0, a1, left10, right10, coef10, coef10);
+
+		/* Repeat for next two taps */
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left2, f0.l is right2 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left3, f1.l is right3 */
+		left10 = AE_SEL32_HH(f0, f1);
+		right10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAF2D32RA_HH_LL(a0, a1, left10, right10, coef32, coef32);
 	}
 
-	dp1 = (ae_f24 *)rp;
+	/* Scale FIR output with right shifts, round/saturate
+	 * to Q1.31, and store 32 bit output.
+	 */
+	f0 = AE_ROUND32X2F48SASYM(AE_SRAA64(a1, shift), AE_SRAA64(a0, shift));
+	AE_S32X2_I(f0, (ae_int32x2 *)wp, 0);
+}
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	ae_valignx2 coef_align;
+	ae_f64 a0 = AE_ZERO64();
+	ae_f32x2 coef32;
+	ae_f32x2 coef10;
+	ae_f32x2 sample10;
+	ae_f32x2 f0;
+	ae_f32x2 f1;
+	ae_int32x4 *coefp;
+	ae_f32 *dp;
+	ae_f32 *dp1 = rp;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+	int i;
+	int j;
+
 	for (j = 0; j < nch; j++) {
 		/* Copy pointer and advance to next ch with dummy load */
-		dp0 = dp1;
-		AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
+		dp = dp1;
+		AE_L32_XC(f0, dp1, -sizeof(ae_f32));
 
 		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f24x2 *)cp;
+		coefp = (ae_int32x4 *)cp;
 		a0 = AE_ZERO64();
+		coef_align = AE_LA128_PP(coefp);
 
 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration. Data is read from
 		 * interleaved buffer with stride of channels count.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
-			/* Load two coefficients */
-			coef2 = *coefp++;
+			/* Load four coefficients
+			 * TODO: Ensure coefficients are 128 bits aligned
+			 */
+			AE_LA32X2X2_IP(coef10, coef32, coef_align, coefp);
 
 			/* Load two data samples, place to high and
 			 * low of data2.
 			 */
-			AE_L32F24_XC(d0, dp0, inc);
-			AE_L32F24_XC(d1, dp0, inc);
-			data2 = AE_SELP24_LL(d0, d1);
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
 
 			/* Accumulate to data2_h * coef2_h +
 			 * data2_l*coef2_l. The Q1.31 bit data is used
 			 * as Q1.23 from MSB side bits of the 32 bit
 			 * word. The accumulator m is Q17.47.
 			 */
-			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef10);
 
 			/* Repeat the same for next two filter taps */
-			coef2 = *coefp++;
-			AE_L32F24_XC(d0, dp0, inc);
-			AE_L32F24_XC(d1, dp0, inc);
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef32);
 		}
 
 		/* Scale FIR output with right shifts, round/saturate Q17.47
 		 * to Q1.31, and store 32 bit output. Advance write
 		 * pointer to next sample.
 		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
+		f0 = AE_ROUND32F48SASYM(AE_SRAA64(a0, shift));
+		AE_S32_L_XP(f0, wp, sizeof(int32_t));
 	}
 }
 
@@ -316,7 +336,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 	 * 16x integers
 	 * 11x address pointers,
 	 */
-	ae_int32x2 q = AE_ZERO32();
+	ae_int32x2 q;
 	ae_f32 *rp;
 	ae_f32 *wp;
 	int i;
@@ -343,12 +363,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 	int32_t *y_wptr = (int32_t *)s->y_wptr;
 	int32_t *x_end_addr = (int32_t *)s->x_end_addr;
 	int32_t *y_end_addr = (int32_t *)s->y_end_addr;
-
-#if SRC_SHORT
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
-#else
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
-#endif
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE;
 
 	for (n = 0; n < s->times; n++) {
 		/* Input data to filter */
@@ -389,17 +404,25 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 		 * sub-filters.
 		 */
 		wp = (ae_f32 *)fir->out_rp;
-		for (i = 0; i < cfg->num_of_subfilters; i++) {
-			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
-			wp += nch_x_odm;
-			cp = (uint8_t *)cp + subfilter_size;
-			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
-
-			/* Circular advance pointer rp by number of
-			 * channels x input delay multiplier. Loaded value q
-			 * is discarded.
-			 */
-			AE_L32_XC(q, rp, nch_x_idm_sz);
+		if (nch == 2) {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input
+				 * delay multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
 		}
 
 		/* Output */
@@ -465,12 +488,7 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
 	int16_t *y_wptr = (int16_t *)s->y_wptr;
 	int16_t *x_end_addr = (int16_t *)s->x_end_addr;
 	int16_t *y_end_addr = (int16_t *)s->y_end_addr;
-
-#if SRC_SHORT
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
-#else
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
-#endif
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE;
 
 	for (n = 0; n < s->times; n++) {
 		/* Input data */
@@ -510,17 +528,25 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
 		 * sub-filters.
 		 */
 		wp = (ae_f32 *)fir->out_rp;
-		for (i = 0; i < cfg->num_of_subfilters; i++) {
-			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
-			wp += nch_x_odm;
-			cp = (uint8_t *)cp + subfilter_size;
-			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
-
-			/* Circular advance pointer rp by number of
-			 * channels x input delay multiplier. Loaded value q
-			 * is discarded.
-			 */
-			AE_L32_XC(q, rp, nch_x_idm_sz);
+		if (nch == 2) {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input delay
+				 * multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
 		}
 
 		/* Output */

From 3372cfb55ec188c15780fd121b77787e30f5ecf3 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Mon, 10 Feb 2025 16:21:45 +0200
Subject: [PATCH 4/4] Audio: SRC: Optimize HiFi4 version code

This patch applies to the HiFi4 version a part of the optimizations
done for the HiFi5 version.

- As a fix, the coefficient read align register is now primed in the
  16 bit coefficients version. Without priming, the load works
  correctly only for aligned addresses.
- The FIR filter functions are separated into stereo and any channels
  count versions. This avoids a channels count check for every
  intermediate and final sample value.
- The 32 bit coefficients version is similarly separated for stereo
  and any channels version.
- The data loads are changed to more efficient 32 bits without
  s24 conversion. The dual-MAC is changed to 32x32 with Q17.47
  format.
- The data store is changed in stereo version to 64 bit store of
  stereo frame.
- In src_polyphase_stage_cir() function the filters process loops
  are separated for stereo and for any channels count.

The MCPS is improved from 18.03 to 16.00 MCPS in stereo 32 bit
44.1 kHz to 48 kHz conversion.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/src/src_hifi4.c | 428 ++++++++++++++++++++------------------
 1 file changed, 229 insertions(+), 199 deletions(-)

diff --git a/src/audio/src/src_hifi4.c b/src/audio/src/src_hifi4.c
index 18130cd7f63c..ad3b8583113f 100644
--- a/src/audio/src/src_hifi4.c
+++ b/src/audio/src/src_hifi4.c
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: BSD-3-Clause
 //
-// Copyright(c) 2022 Intel Corporation. All rights reserved.
+// Copyright(c) 2022-2025 Intel Corporation.
 //
 // Author: Krzysztof Frydryk <krzysztofx.frydryk@intel.com>
+//         Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
 
-/* HiFi4 optimized code parts for SRC */
+/* HiFi4 optimized code parts for SRC */
 
 #include "src_config.h"
 
@@ -23,20 +24,21 @@
 #include <zephyr/sys/util.h>
 #endif /* __ZEPHYR__ */
 
-/* HiFi4 has
- * 16x 64 bit registers in register file AE_DR
- */
+#if SRC_SHORT
+#define SRC_COEF_SIZE	sizeof(int16_t)
+#else
+#define SRC_COEF_SIZE	sizeof(int32_t)
+#endif
 
 #if SRC_SHORT /* 16 bit coefficients version */
 
-static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
-			      const int taps_div_4, const int shift,
-			      const int nch)
+static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+				  const int taps_div_4, const int shift)
 {
 	/* This function uses
-	 * 6x 64 bit registers
-	 * 3x integers
-	 * 5x address pointers,
+	 * 7x 64 bit registers
+	 * 2x integers
+	 * 3x address pointers,
 	 */
 	ae_f64 a0;
 	ae_f64 a1;
@@ -47,73 +49,89 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 	ae_f32x2 data2;
 	ae_f16x4 *coefp;
 	ae_f32x2 *dp;
-	ae_f32 *dp0;
-	ae_f32 *dp1;
 	int i;
-	int j;
 	ae_f32 *wp = wp0;
-	const int inc = nch * sizeof(int32_t);
+	const int inc = 2 * sizeof(int32_t);
 
-	if (nch == 2) {
-		/* Move data pointer back by one sample to start from right
-		 * channel sample. Discard read value p0.
+	/* Move data pointer back by one sample to start from right
+	 * channel sample. Discard read value p0.
+	 */
+	dp = (ae_f32x2 *)rp;
+	AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+	/* Reset coefficient pointer and clear accumulator */
+	coefp = (ae_f16x4 *)cp;
+	a0 = AE_ZERO64();
+	a1 = AE_ZERO64();
+	u = AE_LA64_PP(coefp);
+
+	/* Compute FIR filter for current channel with four
+	 * taps per every loop iteration.  Four coefficients
+	 * are loaded simultaneously. Data is read
+	 * from interleaved buffer with stride of channels
+	 * count.
+	 */
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients */
+		AE_LA16X4_IP(coef4, u, coefp);
+
+		/* Load two data samples from two channels */
+		AE_L32X2_XC(d0, dp, inc); /* r0, l0 */
+		AE_L32X2_XC(d1, dp, inc); /* r1, l1 */
+
+		/* Select to data2 sequential samples from a channel
+		 * and then accumulate to a0 and a1
+		 * data2_h * coef4_3 + data2_l * coef4_2.
+		 * The data is 32 bits Q1.31 and coefficient 16 bits
+		 * Q1.15. The accumulators are Q17.47.
 		 */
-		dp = (ae_f32x2 *)rp;
-		AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+		data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
+		AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+		data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
+		AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
 
-		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f16x4 *)cp;
-		a0 = AE_ZERO64();
-		a1 = AE_ZERO64();
+		/* Load two data samples from two channels */
+		AE_L32X2_XC(d0, dp, inc); /* r2, l2 */
+		AE_L32X2_XC(d1, dp, inc); /* r3, l3 */
 
-		/* Compute FIR filter for current channel with four
-		 * taps per every loop iteration.  Four coefficients
-		 * are loaded simultaneously. Data is read
-		 * from interleaved buffer with stride of channels
-		 * count.
+		/* Accumulate
+		 * data2_h * coef4_1 + data2_l * coef4_0.
 		 */
-		for (i = 0; i < taps_div_4; i++) {
-			/* Load four coefficients */
-			AE_LA16X4_IP(coef4, u, coefp);
-
-			/* Load two data samples from two channels */
-			AE_L64_XC(d0, dp, inc); /* r0, l0 */
-			AE_L64_XC(d1, dp, inc); /* r1, l1 */
-
-			/* Select to data2 sequential samples from a channel
-			 * and then accumulate to a0 and a1
-			 * data2_h * coef4_3 + data2_l * coef4_2.
-			 * The data is 32 bits Q1.31 and coefficient 16 bits
-			 * Q1.15. The accumulators are Q17.47.
-			 */
-			data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
-			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
-			data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
-			AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
-
-			/* Load two data samples from two channels */
-			AE_L64_XC(d0, dp, inc); /* r2, l2 */
-			AE_L64_XC(d1, dp, inc); /* r3, l3 */
-
-			/* Accumulate
-			 * data2_h * coef4_1 + data2_l * coef4_0.
-			 */
-			data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
-			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
-			data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
-			AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
-		}
+		data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
+		AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+		data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
+		AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
+	}
 
-		/* Scale FIR output with right shifts, round/saturate
-		 * to Q1.31, and store 32 bit output.
-		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
-			    sizeof(int32_t));
+	/* Scale FIR output with right shifts, round/saturate
+	 * to Q1.31, and store 32 bit output.
+	 */
+	AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t));
+	AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t));
+}
 
-		return;
-	}
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	/* This function uses
+	 * 6x 64 bit registers
+	 * 3x integers
+	 * 3x address pointers,
+	 */
+	ae_f64 a0;
+	ae_valign u;
+	ae_f16x4 coef4;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	ae_f32x2 data2;
+	ae_f16x4 *coefp;
+	ae_f32 *dp0;
+	ae_f32 *dp1;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
 
 	dp1 = (ae_f32 *)rp;
 	for (j = 0; j < nch; j++) {
@@ -124,6 +142,7 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 		/* Reset coefficient pointer and clear accumulator */
 		coefp = (ae_f16x4 *)cp;
 		a0 = AE_ZERO64();
+		u = AE_LA64_PP(coefp);
 
 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration. Data is read from
@@ -169,139 +188,144 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
 
 #else /* 32bit coefficients version */
 
-static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
-			      const int taps_div_4, const int shift,
-			      const int nch)
+static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+				  const int taps_div_4, const int shift)
 {
-	/* This function uses
-	 * 6x 64 bit registers
-	 * 3x integers
-	 * 5x address pointers,
-	 */
-	ae_f64 a0;
-	ae_f64 a1;
-	ae_f24x2 data2;
-	ae_f24x2 coef2;
-	ae_f24x2 d0;
-	ae_f24x2 d1;
-	ae_f24x2 *coefp;
-	ae_f24x2 *dp;
-	ae_f24 *dp1;
-	ae_f24 *dp0;
-	int i;
-	int j;
+	ae_f64 a0 = AE_ZERO64();
+	ae_f64 a1 = AE_ZERO64();
+	ae_valign coef_align;
+	ae_f32x2 coef32;
+	ae_f32x2 coef10;
+	ae_f32x2 sample10;
+	ae_f32x2 f0;
+	ae_f32x2 f1;
+	ae_int32x2 *coefp;
+	ae_f32x2 *dp;
 	ae_f32 *wp = wp0;
-	const int inc = nch * sizeof(int32_t);
+	const int inc = 2 * sizeof(int32_t);
+	int i;
 
-	if (nch == 2) {
-		/* Move data pointer back by one sample to start from right
-		 * channel sample. Discard read value p0.
+	/* Move data pointer back by one sample to start from right
+	 * channel sample. Discard read value p0.
+	 */
+	dp = (ae_f32x2 *)rp;
+	AE_L32_XC(f0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+	/* Reset coefficient pointer and prime the align register */
+	coefp = (ae_int32x2 *)cp;
+	coef_align = AE_LA64_PP(coefp);
+
+	/* Compute FIR filter for current channel with four
+	 * taps per every loop iteration.  Two coefficients
+	 * are loaded simultaneously. Data is read
+	 * from interleaved buffer with stride of channels
+	 * count.
+	 */
+	for (i = 0; i < taps_div_4; i++) {
+		/* Load four coefficients */
+		AE_LA32X2_IP(coef10, coef_align, coefp);
+		AE_LA32X2_IP(coef32, coef_align, coefp);
+
+		/* Load two data samples from two channels. Note: A 128 bit
+		 * load can't be used because the polyphase sub-filters shift
+		 * the data start, which breaks the alignment requirement.
 		 */
-		dp = (ae_f24x2 *)rp;
-		AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left0, f0.l is right0 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left1, f1.l is right1 */
 
-		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f24x2 *)cp;
-		a0 = AE_ZERO64();
-		a1 = AE_ZERO64();
-
-		/* Compute FIR filter for current channel with four
-		 * taps per every loop iteration.  Two coefficients
-		 * are loaded simultaneously. Data is read
-		 * from interleaved buffer with stride of channels
-		 * count.
+		/* a0 (left)  += left10.h  (left1)  * coef10.h (coef2)
+		 *            += left10.l  (left0)  * coef10.l (coef1)
+		 * a1 (right) += right10.h (right1) * coef10.h (coef2)
+		 *            += right10.l (right0) * coef10.l (coef1)
 		 */
-		for (i = 0; i < taps_div_4; i++) {
-			/* Load two coefficients. Coef2_h contains tap *coefp
-			 * and coef2_l contains the next tap.
-			 */
-			/* TODO: Ensure coefficients are 64 bits aligned */
-			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
-
-			/* Load two data samples from two channels */
-			AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
-			AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
-
-			/* Select to data2 two successive left channel samples
-			 * from d0 and d1, multiply-add and accumulate to a0.
-			 * Select to data2 two successive right channel samples
-			 * from d0 and d1, multiply-add and accumulate to a1.
-			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
-			 * data and Q1.15 coefficients are used as 24 bits as
-			 * Q1.23 values.
-			 */
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
-			data2 = AE_SELP24_HH(d0, d1);
-			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
-
-			/* Repeat for next two taps */
-			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
-			AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
-			AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
-			data2 = AE_SELP24_HH(d0, d1);
-			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
-		}
+		sample10 = AE_SEL32_HH(f0, f1);
+		AE_MULAAFD32RA_HH_LL(a0, sample10, coef10);
+		sample10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAFD32RA_HH_LL(a1, sample10, coef10);
+
+		/* Repeat for next two taps */
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left2, f0.l is right2 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left3, f1.l is right3 */
+		sample10 = AE_SEL32_HH(f0, f1);
+		AE_MULAAFD32RA_HH_LL(a0, sample10, coef32);
+		sample10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAFD32RA_HH_LL(a1, sample10, coef32);
+	}
 
-		/* Scale FIR output with right shifts, round/saturate
-		 * to Q1.31, and store 32 bit output.
-		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
-			    sizeof(int32_t));
+	/* Scale FIR output with right shifts, round/saturate
+	 * to Q1.31, and store 32 bit output.
+	 */
+	f0 = AE_ROUND32X2F48SASYM(AE_SRAA64(a1, shift), AE_SRAA64(a0, shift));
+	AE_S32X2_I(f0, (ae_int32x2 *)wp, 0);
+}
 
-		return;
-	}
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	ae_f64 a0 = AE_ZERO64();
+	ae_valign coef_align;
+	ae_f32x2 coef32;
+	ae_f32x2 coef10;
+	ae_f32x2 sample10;
+	ae_f32x2 f0;
+	ae_f32x2 f1;
+	ae_int32x2 *coefp;
+	ae_f32 *dp;
+	ae_f32 *dp1 = rp;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+	int i;
+	int j;
 
-	dp1 = (ae_f24 *)rp;
 	for (j = 0; j < nch; j++) {
 		/* Copy pointer and advance to next ch with dummy load */
-		dp0 = dp1;
-		AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
+		dp = dp1;
+		AE_L32_XC(f0, dp1, -sizeof(ae_f32));
 
 		/* Reset coefficient pointer and clear accumulator */
-		coefp = (ae_f24x2 *)cp;
+		coefp = (ae_int32x2 *)cp;
 		a0 = AE_ZERO64();
+		coef_align = AE_LA64_PP(coefp);
 
 		/* Compute FIR filter for current channel with four
 		 * taps per every loop iteration. Data is read from
 		 * interleaved buffer with stride of channels count.
 		 */
 		for (i = 0; i < taps_div_4; i++) {
-			/* Load two coefficients */
-			coef2 = *coefp++;
+			/* Load four coefficients
+			 * TODO: Ensure coefficients are 128 bits aligned
+			 */
+			AE_LA32X2_IP(coef10, coef_align, coefp);
+			AE_LA32X2_IP(coef32, coef_align, coefp);
 
 			/* Load two data samples, place to high and
 			 * low of data2.
 			 */
-			AE_L32F24_XC(d0, dp0, inc);
-			AE_L32F24_XC(d1, dp0, inc);
-			data2 = AE_SELP24_LL(d0, d1);
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
 
 			/* Accumulate to data2_h * coef2_h +
 			 * data2_l*coef2_l. The Q1.31 bit data is used
 			 * as Q1.23 from MSB side bits of the 32 bit
 			 * word. The accumulator m is Q17.47.
 			 */
-			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef10);
 
 			/* Repeat the same for next two filter taps */
-			coef2 = *coefp++;
-			AE_L32F24_XC(d0, dp0, inc);
-			AE_L32F24_XC(d1, dp0, inc);
-			data2 = AE_SELP24_LL(d0, d1);
-			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef32);
 		}
 
 		/* Scale FIR output with right shifts, round/saturate Q17.47
 		 * to Q1.31, and store 32 bit output. Advance write
 		 * pointer to next sample.
 		 */
-		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
-			    sizeof(int32_t));
+		f0 = AE_ROUND32F48SASYM(AE_SRAA64(a0, shift));
+		AE_S32_L_XP(f0, wp, sizeof(int32_t));
 	}
 }
 
@@ -315,7 +339,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 	 * 16x integers
 	 * 11x address pointers,
 	 */
-	ae_int32x2 q = AE_ZERO32();
+	ae_int32x2 q;
 	ae_f32 *rp;
 	ae_f32 *wp;
 	int i;
@@ -342,12 +366,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 	int32_t *y_wptr = (int32_t *)s->y_wptr;
 	int32_t *x_end_addr = (int32_t *)s->x_end_addr;
 	int32_t *y_end_addr = (int32_t *)s->y_end_addr;
-
-#if SRC_SHORT
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
-#else
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
-#endif
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE;
 
 	for (n = 0; n < s->times; n++) {
 		/* Input data to filter */
@@ -388,17 +407,25 @@ void src_polyphase_stage_cir(struct src_stage_prm *s)
 		 * sub-filters.
 		 */
 		wp = (ae_f32 *)fir->out_rp;
-		for (i = 0; i < cfg->num_of_subfilters; i++) {
-			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
-			wp += nch_x_odm;
-			cp = (uint8_t *)cp + subfilter_size;
-			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
-
-			/* Circular advance pointer rp by number of
-			 * channels x input delay multiplier. Loaded value q
-			 * is discarded.
-			 */
-			AE_L32_XC(q, rp, nch_x_idm_sz);
+		if (nch == 2) {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input
+				 * delay multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
 		}
 
 		/* Output */
@@ -464,12 +491,7 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
 	int16_t *y_wptr = (int16_t *)s->y_wptr;
 	int16_t *x_end_addr = (int16_t *)s->x_end_addr;
 	int16_t *y_end_addr = (int16_t *)s->y_end_addr;
-
-#if SRC_SHORT
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
-#else
-	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
-#endif
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE;
 
 	for (n = 0; n < s->times; n++) {
 		/* Input data */
@@ -509,17 +531,25 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
 		 * sub-filters.
 		 */
 		wp = (ae_f32 *)fir->out_rp;
-		for (i = 0; i < cfg->num_of_subfilters; i++) {
-			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
-			wp += nch_x_odm;
-			cp = (uint8_t *)cp + subfilter_size;
-			src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
-
-			/* Circular advance pointer rp by number of
-			 * channels x input delay multiplier. Loaded value q
-			 * is discarded.
-			 */
-			AE_L32_XC(q, rp, nch_x_idm_sz);
+		if (nch == 2) {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input delay
+				 * multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
 		}
 
 		/* Output */