From 906b0b93604d5231f7f96a9a310bb3b9c0596a47 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Mon, 3 Feb 2025 14:16:22 +0200
Subject: [PATCH 1/3] Math: IIR DF1: Add a simplified 4th order IIR process
 function

The 4th order filter with two biquads in series is commonly used in
crossover and multiband DRC components. Omitting the outer loop for
parallel biquads and the check for null coefficients, and using a
fixed loop count of two, makes the critical code faster.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/include/sof/math/iir_df1.h | 17 +++++++++
 src/math/iir_df1_generic.c     | 54 +++++++++++++++++++++++++++
 src/math/iir_df1_hifi3.c       | 68 ++++++++++++++++++++++++++++++++++
 src/math/iir_df1_hifi4.c       | 61 ++++++++++++++++++++++++++++++
 src/math/iir_df1_hifi5.c       | 59 +++++++++++++++++++++++++++++
 5 files changed, 259 insertions(+)

diff --git a/src/include/sof/math/iir_df1.h b/src/include/sof/math/iir_df1.h
index fc5e034b3b4b..24653f690ea3 100644
--- a/src/include/sof/math/iir_df1.h
+++ b/src/include/sof/math/iir_df1.h
@@ -13,6 +13,7 @@
 #include <sof/common.h>
 
 #define IIR_DF1_NUM_STATE 4
+#define SOF_IIR_DF1_4TH_NUM_BIQUADS 2
 
 struct iir_state_df1 {
 	unsigned int biquads; /* Number of IIR 2nd order sections total */
@@ -34,8 +35,24 @@ void iir_init_delay_df1(struct iir_state_df1 *iir, int32_t **state);
 
 void iir_reset_df1(struct iir_state_df1 *iir);
 
+/**
+ * Calculate IIR filter consisting of biquads
+ * @param iir	IIR state with configured biquad coefficients and delay lines data
+ * @param x	Single s32 Q1.31 format input sample
+ * @return	Single s32 Q1.31 format output sample
+ */
 int32_t iir_df1(struct iir_state_df1 *iir, int32_t x);
 
+/**
+ * Calculate IIR filter consisting of biquads, special simplified version for
+ * 4th order filter with two biquads in series. Note: There are no checks for
+ * iir struct members.
+ * @param iir	IIR state with configured biquad coefficients and delay lines data
+ * @param x	Single s32 Q1.31 format input sample
+ * @return	Single s32 Q1.31 format output sample
+ */
+int32_t iir_df1_4th(struct iir_state_df1 *iir, int32_t x);
+
 /* Inline functions */
 #if SOF_USE_MIN_HIFI(3, FILTER)
 #include "iir_df1_hifi3.h"
diff --git a/src/math/iir_df1_generic.c b/src/math/iir_df1_generic.c
index 6e6482259569..4b069eafb3d7 100644
--- a/src/math/iir_df1_generic.c
+++ b/src/math/iir_df1_generic.c
@@ -109,4 +109,58 @@ int32_t iir_df1(struct iir_state_df1 *iir, int32_t x)
 }
 EXPORT_SYMBOL(iir_df1);
 
+int32_t iir_df1_4th(struct iir_state_df1 *iir, int32_t x)
+{
+	int32_t in;
+	int32_t tmp;
+	int64_t acc;
+	int i;
+	int d = 0; /* Index to state */
+	int c = 0; /* Index to coefficient a2 */
+	int32_t *coefp = iir->coef;
+	int32_t *delay = iir->delay;
+
+	/* Coefficients order in coef[] is {a2, a1, b2, b1, b0, shift, gain} */
+	/* Delay order in state[] is {y(n - 2), y(n - 1), x(n - 2), x(n - 1)} */
+	in = x;
+	for (i = 0; i < SOF_IIR_DF1_4TH_NUM_BIQUADS; i++) {
+		/* Compute output: Accumulator is Q3.61
+		 * Q2.30 x Q1.31 -> Q3.61
+		 * Shift Q3.61 to Q3.31 with rounding, saturate to Q1.31
+		 */
+		acc = ((int64_t)coefp[c]) * delay[d]; /* a2 * y(n - 2) */
+		acc += ((int64_t)coefp[c + 1]) * delay[d + 1]; /* a1 * y(n - 1) */
+		acc += ((int64_t)coefp[c + 2]) * delay[d + 2]; /* b2 * x(n - 2) */
+		acc += ((int64_t)coefp[c + 3]) * delay[d + 3]; /* b1 * x(n - 1) */
+		acc += ((int64_t)coefp[c + 4]) * in; /* b0 * x */
+		tmp = (int32_t)sat_int32(Q_SHIFT_RND(acc, 61, 31));
+
+		/* update the delay value */
+		delay[d] = delay[d + 1];
+		delay[d + 1] = tmp;
+		delay[d + 2] = delay[d + 3];
+		delay[d + 3] = in;
+
+		/* Apply gain Q2.14 x Q1.31 -> Q3.45 */
+		acc = ((int64_t)coefp[c + 6]) * tmp; /* Gain */
+
+		/* Apply biquad output shift right parameter
+		 * simultaneously with Q3.45 to Q3.31 conversion. Then
+		 * saturate to 32 bits Q1.31 and prepare for next
+		 * biquad.
+		 */
+		acc = Q_SHIFT_RND(acc, 45 + coefp[c + 5], 31);
+		in = sat_int32(acc);
+
+		/* Proceed to next biquad coefficients and delay
+		 * lines.
+		 */
+		c += SOF_EQ_IIR_NBIQUAD;
+		d += IIR_DF1_NUM_STATE;
+	}
+	/* Output of previous section is in variable in */
+	return in;
+}
+EXPORT_SYMBOL(iir_df1_4th);
+
 #endif
diff --git a/src/math/iir_df1_hifi3.c b/src/math/iir_df1_hifi3.c
index 7c1237f55f79..eddcc3f980ea 100644
--- a/src/math/iir_df1_hifi3.c
+++ b/src/math/iir_df1_hifi3.c
@@ -126,4 +126,72 @@ int32_t iir_df1(struct iir_state_df1 *iir, int32_t x)
 }
 EXPORT_SYMBOL(iir_df1);
 
+int32_t iir_df1_4th(struct iir_state_df1 *iir, int32_t x)
+{
+	ae_int64 acc;
+	ae_valign coef_align;
+	ae_int32x2 coef_a2a1;
+	ae_int32x2 coef_b2b1;
+	ae_int32x2 coef_b0;
+	ae_int32x2 gain;
+	ae_int32x2 shift;
+	ae_int32x2 delay_y2y1;
+	ae_int32x2 delay_x2x1;
+	ae_int32 in;
+	ae_int32 tmp;
+	ae_int32x2 *coefp;
+	ae_int32x2 *delayp;
+	int32_t *delay_update;
+	int i;
+
+	/* Coefficients order in coef[] is {a2, a1, b2, b1, b0, shift, gain} */
+	coefp = (ae_int32x2 *)&iir->coef[0];
+	delayp = (ae_int32x2 *)&iir->delay[0];
+	in = x;
+	for (i = 0; i < SOF_IIR_DF1_4TH_NUM_BIQUADS; i++) {
+		/* Compute output: Delay is kept Q17.47 while multiply
+		 * instruction gives Q2.30 x Q1.31 -> Q18.46. Need to
+		 * shift delay line values right by one for same align
+		 * as MAC. Store to delay line need to be shifted left
+		 * by one similarly.
+		 */
+		coef_align = AE_LA64_PP(coefp);
+		AE_LA32X2_IP(coef_a2a1, coef_align, coefp);
+		AE_LA32X2_IP(coef_b2b1, coef_align, coefp);
+		AE_L32_IP(coef_b0, (ae_int32 *)coefp, 4);
+		AE_L32_IP(shift, (ae_int32 *)coefp, 4);
+		AE_L32_IP(gain, (ae_int32 *)coefp, 4);
+
+		AE_L32X2_IP(delay_y2y1, delayp, 8);
+		AE_L32X2_IP(delay_x2x1, delayp, 8);
+
+		acc = AE_MULF32R_HH(coef_a2a1, delay_y2y1); /* a2 * y(n - 2) */
+		AE_MULAF32R_LL(acc, coef_a2a1, delay_y2y1); /* a1 * y(n - 1) */
+		AE_MULAF32R_HH(acc, coef_b2b1, delay_x2x1); /* b2 * x(n - 2) */
+		AE_MULAF32R_LL(acc, coef_b2b1, delay_x2x1); /* b1 * x(n - 1) */
+		AE_MULAF32R_HH(acc, coef_b0, in); /*  b0 * x  */
+		acc = AE_SLAI64S(acc, 1); /* Convert to Q17.47 */
+		tmp = AE_ROUND32F48SSYM(acc); /* Round to Q1.31 */
+
+		/* update the state value */
+		delay_update = (int32_t *)delayp - 4;
+		delay_update[0] = delay_update[1];
+		delay_update[1] = tmp;
+		delay_update[2] = delay_update[3];
+		delay_update[3] = in;
+
+		/* Apply gain Q18.14 x Q1.31 -> Q34.30 */
+		acc = AE_MULF32R_HH(gain, tmp); /* Gain */
+		acc = AE_SLAI64S(acc, 17); /* Convert to Q17.47 */
+
+		/* Apply biquad output shift right parameter and then
+		 * round and saturate to 32 bits Q1.31.
+		 */
+		acc = AE_SRAA64(acc, shift);
+		in = AE_ROUND32F48SSYM(acc);
+	}
+	return in;
+}
+EXPORT_SYMBOL(iir_df1_4th);
+
 #endif
diff --git a/src/math/iir_df1_hifi4.c b/src/math/iir_df1_hifi4.c
index 07a4a495369d..945fd67af5bc 100644
--- a/src/math/iir_df1_hifi4.c
+++ b/src/math/iir_df1_hifi4.c
@@ -119,4 +119,65 @@ int32_t iir_df1(struct iir_state_df1 *iir, int32_t x)
 }
 EXPORT_SYMBOL(iir_df1);
 
+int32_t iir_df1_4th(struct iir_state_df1 *iir, int32_t x)
+{
+	ae_valign coef_align;
+	ae_valign data_r_align;
+	ae_valign data_w_align = AE_ZALIGN64();
+	ae_f64 acc;
+	ae_int32x2 delay_y2y1;
+	ae_int32x2 delay_x2x1;
+	ae_int32x2 coef_a2a1;
+	ae_int32x2 coef_b2b1;
+	ae_int32x2 coef_b0;
+	ae_int32x2 gain;
+	ae_int32x2 shift;
+	ae_int32 in;
+	ae_int32x2 *coefp = (ae_int32x2 *)iir->coef;
+	ae_int32x2 *delay_r  = (ae_int32x2 *)iir->delay;
+	ae_int32x2 *delay_w = delay_r;
+	int i;
+
+	/* Coefficients order in coef[] is {a2, a1, b2, b1, b0, shift, gain} */
+	/* Delay order in state[] is {y(n - 2), y(n - 1), x(n - 2), x(n - 1)} */
+	data_r_align = AE_LA64_PP(delay_r);
+	in = x;
+	for (i = 0; i < SOF_IIR_DF1_4TH_NUM_BIQUADS; i++) {
+		/* Load data */
+		AE_LA32X2_IP(delay_y2y1, data_r_align, delay_r);
+		AE_LA32X2_IP(delay_x2x1, data_r_align, delay_r);
+
+		/* Load coefficients */
+		coef_align = AE_LA64_PP(coefp);
+		AE_LA32X2_IP(coef_a2a1, coef_align, coefp);
+		AE_LA32X2_IP(coef_b2b1, coef_align, coefp);
+		AE_L32_IP(coef_b0, (ae_int32 *)coefp, 4);
+		AE_L32_IP(shift, (ae_int32 *)coefp, 4);
+		AE_L32_IP(gain, (ae_int32 *)coefp, 4);
+
+		acc = AE_MULF32RA_HH(coef_b0, in);		  /* acc = b0 * in */
+		AE_MULAAFD32RA_HH_LL(acc, coef_a2a1, delay_y2y1); /* + a2 * y2 + a1 * y1 */
+		AE_MULAAFD32RA_HH_LL(acc, coef_b2b1, delay_x2x1); /* + b2 * x2 + b1 * x1 */
+		AE_PKSR32(delay_y2y1, acc, 1);		     /* y2 = y1, y1 = acc(q1.31) */
+		delay_x2x1 = AE_SEL32_LL(delay_x2x1, in);    /* x2 = x1, x1 = in */
+
+		/* Store data */
+		AE_SA32X2_IP(delay_y2y1, data_w_align, delay_w);
+		AE_SA32X2_IP(delay_x2x1, data_w_align, delay_w);
+
+		/* Apply gain */
+		acc = AE_MULF32R_LL(gain, delay_y2y1);	/* acc = gain * y1 */
+		acc = AE_SLAI64S(acc, 17);		/* Convert to Q17.47 */
+
+		/* Apply biquad output shift right parameter and then
+		 * round and saturate to 32 bits Q1.31.
+		 */
+		acc = AE_SRAA64(acc, shift);
+		in = AE_ROUND32F48SSYM(acc);
+	}
+	AE_SA64POS_FP(data_w_align, delay_w);
+	return in;
+}
+EXPORT_SYMBOL(iir_df1_4th);
+
 #endif
diff --git a/src/math/iir_df1_hifi5.c b/src/math/iir_df1_hifi5.c
index 262cb5120bff..ca331d28c7fa 100644
--- a/src/math/iir_df1_hifi5.c
+++ b/src/math/iir_df1_hifi5.c
@@ -116,4 +116,63 @@ int32_t iir_df1(struct iir_state_df1 *iir, int32_t x)
 }
 EXPORT_SYMBOL(iir_df1);
 
+int32_t iir_df1_4th(struct iir_state_df1 *iir, int32_t x)
+{
+	ae_valignx2 coef_align;
+	ae_valignx2 data_r_align;
+	ae_valignx2 data_w_align = AE_ZALIGN128();
+	ae_f64 acc;
+	ae_int32x2 delay_y2y1;
+	ae_int32x2 delay_x2x1;
+	ae_int32x2 coef_a2a1;
+	ae_int32x2 coef_b2b1;
+	ae_int32x2 coef_b0;
+	ae_int32x2 gain;
+	ae_int32x2 shift;
+	ae_int32 in;
+	ae_int32x4 *coefp = (ae_int32x4 *)iir->coef;
+	ae_int32x4 *delay_r  = (ae_int32x4 *)iir->delay;
+	ae_int32x4 *delay_w = delay_r;
+	int i;
+
+	/* Coefficients order in coef[] is {a2, a1, b2, b1, b0, shift, gain} */
+	/* Delay order in state[] is {y(n - 2), y(n - 1), x(n - 2), x(n - 1)} */
+	data_r_align = AE_LA128_PP(delay_r);
+	in = x;
+	for (i = 0; i < SOF_IIR_DF1_4TH_NUM_BIQUADS; i++) {
+		/* Load data */
+		AE_LA32X2X2_IP(delay_y2y1, delay_x2x1, data_r_align, delay_r);
+
+		/* Load coefficients */
+		coef_align = AE_LA128_PP(coefp);
+		AE_LA32X2X2_IP(coef_a2a1, coef_b2b1, coef_align, coefp);
+		AE_L32_IP(coef_b0, (ae_int32 *)coefp, 4);
+		AE_L32_IP(shift, (ae_int32 *)coefp, 4);
+		AE_L32_IP(gain, (ae_int32 *)coefp, 4);
+
+		acc = AE_MULF32RA_HH(coef_b0, in);		  /* acc = b0 * in */
+		AE_MULAAFD32RA_HH_LL(acc, coef_a2a1, delay_y2y1); /* + a2 * y2 + a1 * y1 */
+		AE_MULAAFD32RA_HH_LL(acc, coef_b2b1, delay_x2x1); /* + b2 * x2 + b1 * x1 */
+		AE_PKSR32(delay_y2y1, acc, 1);		     /* y2 = y1, y1 = acc(q1.31) */
+		delay_x2x1 = AE_SEL32_LL(delay_x2x1, in);   /* x2 = x1, x1 = in */
+
+		/* Store data */
+		AE_SA32X2X2_IP(delay_y2y1, delay_x2x1, data_w_align, delay_w);
+
+		/* Apply gain */
+		acc = AE_MULF32R_LL(gain, delay_y2y1);	/* acc = gain * y1 */
+		acc = AE_SLAI64S(acc, 17);		/* Convert to Q17.47 */
+
+		/* Apply biquad output shift right parameter and then
+		 * round and saturate to 32 bits Q1.31.
+		 */
+		acc = AE_SRAA64(acc, shift);
+		in = AE_ROUND32F48SSYM(acc);
+	}
+
+	AE_SA128POS_FP(data_w_align, delay_w);
+	return in;
+}
+EXPORT_SYMBOL(iir_df1_4th);
+
 #endif

From 6389eff7eaa897a78c791232211251c3bf57f646 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Mon, 3 Feb 2025 20:00:48 +0200
Subject: [PATCH 2/3] Audio: Crossover: Use simpler iir_df1_4th() IIR function

This patch changes the crossover component to use the optimized
4th order IIR function. The LR4 (Linkwitz-Riley 4th order)
filter bank is hard-coded to 4th order, so this change does
not add restrictions.

The filter bank is used by the multiband DRC component. The saving
in a three-band configuration on a HiFi5 platform is 5.2 MCPS,
from 90.36 MCPS to 85.17 MCPS.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/crossover/crossover.c | 8 ++++++--
 src/audio/crossover/crossover.h | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/audio/crossover/crossover.c b/src/audio/crossover/crossover.c
index 3a4af0d43314..2511556263c2 100644
--- a/src/audio/crossover/crossover.c
+++ b/src/audio/crossover/crossover.c
@@ -161,6 +161,10 @@ static int crossover_init_coef_lr4(struct sof_eq_iir_biquad *coef,
 {
 	int ret;
 
+	/* Ensure the LR4 can be processed with the simplified 4th order IIR */
+	if (CROSSOVER_LR4_NUM_BIQUADS != SOF_IIR_DF1_4TH_NUM_BIQUADS)
+		return -EINVAL;
+
 	/* Only one set of coefficients is stored in config for both biquads
 	 * in series due to identity. To maintain the structure of
 	 * iir_state_df1, it requires two copies of coefficients in a row.
@@ -190,8 +194,8 @@ static int crossover_init_coef_lr4(struct sof_eq_iir_biquad *coef,
 	if (!lr4->delay)
 		return -ENOMEM;
 
-	lr4->biquads = 2;
-	lr4->biquads_in_series = 2;
+	lr4->biquads = CROSSOVER_LR4_NUM_BIQUADS;
+	lr4->biquads_in_series = CROSSOVER_LR4_NUM_BIQUADS;
 
 	return 0;
 }
diff --git a/src/audio/crossover/crossover.h b/src/audio/crossover/crossover.h
index 64fc86023ad0..2312a1d53857 100644
--- a/src/audio/crossover/crossover.h
+++ b/src/audio/crossover/crossover.h
@@ -15,6 +15,8 @@
 
 #include "crossover_user.h"
 
+#define CROSSOVER_LR4_NUM_BIQUADS 2
+
 struct comp_buffer;
 struct comp_dev;
 
@@ -122,7 +124,7 @@ static inline int32_t crossover_generic_process_lr4(int32_t in,
 						    struct iir_state_df1 *lr4)
 {
 	/* Cascade two biquads with same coefficients in series. */
-	return iir_df1(lr4, in);
+	return iir_df1_4th(lr4, in);
 }
 
 static inline void crossover_free_config(struct sof_crossover_config **config)

From c07023d87e1ee711f79b39266fdd4246bfae6758 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Mon, 3 Feb 2025 20:04:05 +0200
Subject: [PATCH 3/3] Audio: Multiband DRC: Change emphasis filter to
 iir_df1_4th()

This patch changes the emphasis and de-emphasis IIR filters in the
multiband DRC component to use the optimized 4th order IIR code.

The patch for crossover already covered the band filter bank.
This change saves about 1.7 MCPS more in a HiFi5 build of the
component, from 85.17 MCPS to 83.44 MCPS.

The change does not restrict the configuration. The existing filters
are hard-coded to 4th order (SOF_EMP_DEEMP_BIQUADS).

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/audio/multiband_drc/multiband_drc.c         | 4 ++++
 src/audio/multiband_drc/multiband_drc_generic.c | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/audio/multiband_drc/multiband_drc.c b/src/audio/multiband_drc/multiband_drc.c
index e76741f004fd..6db99f116c1c 100644
--- a/src/audio/multiband_drc/multiband_drc.c
+++ b/src/audio/multiband_drc/multiband_drc.c
@@ -66,6 +66,10 @@ static int multiband_drc_eq_init_coef_ch(struct sof_eq_iir_biquad *coef,
 {
 	int ret;
 
+	/* Ensure the emphasis/de-emphasis EQ fits the simplified 4th order IIR */
+	if (SOF_EMP_DEEMP_BIQUADS != SOF_IIR_DF1_4TH_NUM_BIQUADS)
+		return -EINVAL;
+
 	eq->coef = rzalloc(SOF_MEM_ZONE_RUNTIME, 0, SOF_MEM_CAPS_RAM,
 			   sizeof(struct sof_eq_iir_biquad) * SOF_EMP_DEEMP_BIQUADS);
 	if (!eq->coef)
diff --git a/src/audio/multiband_drc/multiband_drc_generic.c b/src/audio/multiband_drc/multiband_drc_generic.c
index bfdb7c6d3381..bd64f5012fc8 100644
--- a/src/audio/multiband_drc/multiband_drc_generic.c
+++ b/src/audio/multiband_drc/multiband_drc_generic.c
@@ -39,7 +39,7 @@ static void multiband_drc_process_emp_crossover(struct multiband_drc_state *stat
 		crossover_s = &state->crossover[ch];
 
 		if (enable_emp)
-			emp_out = iir_df1(emp_s, *buf_src);
+			emp_out = iir_df1_4th(emp_s, *buf_src);
 		else
 			emp_out = *buf_src;
 
@@ -178,7 +178,7 @@ static void multiband_drc_process_deemp(struct multiband_drc_state *state,
 		}
 
 		if (enable_deemp)
-			*buf_sink = iir_df1(deemp_s, mix_out);
+			*buf_sink = iir_df1_4th(deemp_s, mix_out);
 		else
 			*buf_sink = mix_out;