diff --git a/src/audio/src/CMakeLists.txt b/src/audio/src/CMakeLists.txt index bfc6361afedb..111b2484b4da 100644 --- a/src/audio/src/CMakeLists.txt +++ b/src/audio/src/CMakeLists.txt @@ -1,6 +1,6 @@ # SPDX-License-Identifier: BSD-3-Clause -add_local_sources(sof src_generic.c src_hifi2ep.c src_hifi3.c src_hifi4.c src_common.c src.c) +add_local_sources(sof src_generic.c src_hifi2ep.c src_hifi3.c src_hifi4.c src_hifi5.c src_common.c src.c) if(CONFIG_IPC_MAJOR_3) add_local_sources(sof src_ipc3.c) diff --git a/src/audio/src/src_config.h b/src/audio/src/src_config.h index 01a60230093c..9e53704b123f 100644 --- a/src/audio/src/src_config.h +++ b/src/audio/src/src_config.h @@ -8,55 +8,24 @@ #ifndef __SOF_AUDIO_SRC_SRC_CONFIG_H__ #define __SOF_AUDIO_SRC_SRC_CONFIG_H__ - - -/* If next define is set to 1 the SRC is configured automatically. Setting - * to zero temporarily is useful is for testing needs. - */ -#define SRC_AUTOARCH 1 - -/* Force manually some code variant when SRC_AUTODSP is set to zero. These - * are useful in code debugging. 
- */ -#if SRC_AUTOARCH == 0 -#define SRC_GENERIC 1 -#define SRC_HIFIEP 0 -#define SRC_HIFI3 0 -#endif - -/* Select optimized code variant when xt-xcc compiler is used */ -#if SRC_AUTOARCH == 1 -#if defined __XCC__ -#include -#define SRC_GENERIC 0 -#if XCHAL_HAVE_HIFI4 == 1 -#define SRC_HIFI4 1 -#define SRC_HIFI3 0 -#define SRC_HIFIEP 0 +#include + +/* Follow kconfig for FILTER in SRC component */ +#if SOF_USE_MIN_HIFI(5, FILTER) +#define SRC_HIFI5 1 +#elif SOF_USE_MIN_HIFI(4, FILTER) +#define SRC_HIFI4 1 +#elif SOF_USE_MIN_HIFI(3, FILTER) +#define SRC_HIFI3 1 +#elif SOF_USE_HIFI(2, FILTER) +#define SRC_HIFIEP 1 #else -#define SRC_HIFI4 0 -#if XCHAL_HAVE_HIFI2EP == 1 -#define SRC_HIFIEP 1 -#define SRC_HIFI3 0 -#endif -#if XCHAL_HAVE_HIFI3 == 1 -#define SRC_HIFI3 1 -#define SRC_HIFIEP 0 -#endif -#endif -#else -/* GCC */ -#define SRC_GENERIC 1 -#define SRC_HIFIEP 0 -#define SRC_HIFI3 0 -#if CONFIG_LIBRARY -#else -#define SRC_SHORT 1 /* Need to use for generic code version speed */ -#endif -#endif +#define SRC_GENERIC 1 #endif -/* Kconfig option tiny needs 16 bits coefficients, other options use 32 bits */ +/* Kconfig option tiny needs 16 bits coefficients, other options use 32 bits, + * also gcc builds for all platforms and testbench (library) + */ #if !defined(SRC_SHORT) #if CONFIG_COMP_SRC_TINY #define SRC_SHORT 1 /* 16 bit coefficients filter core */ diff --git a/src/audio/src/src_hifi4.c b/src/audio/src/src_hifi4.c index 1106968f4085..ad3b8583113f 100644 --- a/src/audio/src/src_hifi4.c +++ b/src/audio/src/src_hifi4.c @@ -1,10 +1,11 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2022 Intel Corporation. All rights reserved. +// Copyright(c) 2022-2025 Intel Corporation. 
// // Author: Krzysztof Frydryk +// Seppo Ingalsuo -/* HiFi4 optimized code parts for SRC */ +/* HiFi4 optimized code parts for SRC */ #include "src_config.h" @@ -23,20 +24,21 @@ #include #endif /* __ZEPHYR__ */ -/* HiFi4 has - * 16x 64 bit registers in register file AE_DR - */ +#if SRC_SHORT +#define SRC_COEF_SIZE sizeof(int16_t) +#else +#define SRC_COEF_SIZE sizeof(int32_t) +#endif #if SRC_SHORT /* 16 bit coefficients version */ -static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, - const int taps_div_4, const int shift, - const int nch) +static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift) { /* This function uses - * 6x 64 bit registers - * 3x integers - * 5x address pointers, + * 7x 64 bit registers + * 2x integers + * 3x address pointers, */ ae_f64 a0; ae_f64 a1; @@ -47,73 +49,89 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, ae_f32x2 data2; ae_f16x4 *coefp; ae_f32x2 *dp; - ae_f32 *dp0; - ae_f32 *dp1; int i; - int j; ae_f32 *wp = wp0; - const int inc = nch * sizeof(int32_t); + const int inc = 2 * sizeof(int32_t); - if (nch == 2) { - /* Move data pointer back by one sample to start from right - * channel sample. Discard read value p0. + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value d0. + */ + dp = (ae_f32x2 *)rp; + AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f16x4 *)cp; + a0 = AE_ZERO64(); + a1 = AE_ZERO64(); + u = AE_LA64_PP(coefp); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Four coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count.
+ */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA16X4_IP(coef4, u, coefp); + + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r0, l0 */ + AE_L32X2_XC(d1, dp, inc); /* r1, l1 */ + + /* Select to data2 sequential samples from a channel + * and then accumulate to a0 and a1 + * data2_h * coef4_3 + data2_l * coef4_2. + * The data is 32 bits Q1.31 and coefficient 16 bits + * Q1.15. The accumulators are Q17.47. */ - dp = (ae_f32x2 *)rp; - AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); + data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ + AE_MULAAFD32X16_H3_L2(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ + AE_MULAAFD32X16_H3_L2(a1, data2, coef4); - /* Reset coefficient pointer and clear accumulator */ - coefp = (ae_f16x4 *)cp; - a0 = AE_ZERO64(); - a1 = AE_ZERO64(); + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r2, l2 */ + AE_L32X2_XC(d1, dp, inc); /* r3, l3 */ - /* Compute FIR filter for current channel with four - * taps per every loop iteration. Four coefficients - * are loaded simultaneously. Data is read - * from interleaved buffer with stride of channels - * count. + /* Accumulate + * data2_h * coef4_1 + data2_l * coef4_0. */ - for (i = 0; i < taps_div_4; i++) { - /* Load four coefficients */ - AE_LA16X4_IP(coef4, u, coefp); - - /* Load two data samples from two channels */ - AE_L64_XC(d0, dp, inc); /* r0, l0 */ - AE_L64_XC(d1, dp, inc); /* r1, l1 */ - - /* Select to data2 sequential samples from a channel - * and then accumulate to a0 and a1 - * data2_h * coef4_3 + data2_l * coef4_2. - * The data is 32 bits Q1.31 and coefficient 16 bits - * Q1.15. The accumulators are Q17.47. 
- */ - data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ - AE_MULAAFD32X16_H3_L2(a0, data2, coef4); - data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ - AE_MULAAFD32X16_H3_L2(a1, data2, coef4); - - /* Load two data samples from two channels */ - AE_L64_XC(d0, dp, inc); /* r2, l2 */ - AE_L64_XC(d1, dp, inc); /* r3, l3 */ - - /* Accumulate - * data2_h * coef4_1 + data2_l * coef4_0. - */ - data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ - AE_MULAAFD32X16_H1_L0(a0, data2, coef4); - data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ - AE_MULAAFD32X16_H1_L0(a1, data2, coef4); - } + data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ + AE_MULAAFD32X16_H1_L0(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ + AE_MULAAFD32X16_H1_L0(a1, data2, coef4); + } - /* Scale FIR output with right shifts, round/saturate - * to Q1.31, and store 32 bit output. - */ - AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, - sizeof(int32_t)); - AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, - sizeof(int32_t)); + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. 
+ */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t)); +} - return; - } +static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift, + const int nch) +{ + /* This function uses + * 6x 64 bit registers + * 3x integers + * 3x address pointers, + */ + ae_f64 a0; + ae_valign u; + ae_f16x4 coef4; + ae_f32x2 d0; + ae_f32x2 d1; + ae_f32x2 data2; + ae_f16x4 *coefp; + ae_f32 *dp0; + ae_f32 *dp1; + int i; + int j; + ae_f32 *wp = wp0; + const int inc = nch * sizeof(int32_t); dp1 = (ae_f32 *)rp; for (j = 0; j < nch; j++) { @@ -124,6 +142,7 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, /* Reset coefficient pointer and clear accumulator */ coefp = (ae_f16x4 *)cp; a0 = AE_ZERO64(); + u = AE_LA64_PP(coefp); /* Compute FIR filter for current channel with four * taps per every loop iteration. Data is read from @@ -169,139 +188,144 @@ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, #else /* 32bit coefficients version */ -static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, - const int taps_div_4, const int shift, - const int nch) +static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift) { - /* This function uses - * 6x 64 bit registers - * 3x integers - * 5x address pointers, - */ - ae_f64 a0; - ae_f64 a1; - ae_f24x2 data2 = AE_ZERO24(); - ae_f24x2 coef2 = AE_ZERO24(); - ae_f24x2 d0 = AE_ZERO24(); - ae_f24x2 d1 = AE_ZERO24(); - ae_f24x2 *coefp; - ae_f24x2 *dp; - ae_f24 *dp1; - ae_f24 *dp0; - int i; - int j; + ae_f64 a0 = AE_ZERO64(); + ae_f64 a1 = AE_ZERO64(); + ae_valign coef_align; + ae_f32x2 coef32; + ae_f32x2 coef10; + ae_f32x2 sample10; + ae_f32x2 f0; + ae_f32x2 f1; + ae_int32x2 *coefp; + ae_f32x2 *dp; ae_f32 *wp = wp0; - const int inc = nch * sizeof(int32_t); + const int inc = 2 * sizeof(int32_t); + 
int i; - if (nch == 2) { - /* Move data pointer back by one sample to start from right - * channel sample. Discard read value p0. + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value f0. + */ + dp = (ae_f32x2 *)rp; + AE_L32_XC(f0, (ae_f32 *)dp, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_int32x2 *)cp; + coef_align = AE_LA64_PP(coefp); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA32X2_IP(coef10, coef_align, coefp); + AE_LA32X2_IP(coef32, coef_align, coefp); + + /* Load two data samples from two channels. Note: Due to + * polyphase array data start shift for sub-filters can't + * use 128 bits load due to align requirement. */ - dp = (ae_f24x2 *)rp; - AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24)); + AE_L32X2_XC(f0, dp, inc); /* f0.h is left0, f0.l is right0 */ + AE_L32X2_XC(f1, dp, inc); /* f1.h is left1, f1.l is right1 */ - /* Reset coefficient pointer and clear accumulator */ - coefp = (ae_f24x2 *)cp; - a0 = AE_ZERO64(); - a1 = AE_ZERO64(); - - /* Compute FIR filter for current channel with four - * taps per every loop iteration. Two coefficients - * are loaded simultaneously. Data is read - * from interleaved buffer with stride of channels - * count. + /* a0 (left) += left10.h (left1) * coef10.h (coef2) + * += left10.l (left0) * coef10.l (coef1) + * a1 (right) += right10.h (right1) * coef10.h (coef2) + * += right10.l (right0) * coef10.l (coef1) */ - for (i = 0; i < taps_div_4; i++) { - /* Load two coefficients. Coef2_h contains tap *coefp - * and coef2_l contains the next tap.
- */ - /* TODO: Ensure coefficients are 64 bits aligned */ - AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); - - /* Load two data samples from two channels */ - AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */ - AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */ - - /* Select to data2 two successive left channel samples - * from d0 and d1, multiply-add and accumulate to a0. - * Select to data2 two successive right channel samples - * from d0 and d1, multiply-add and accumulate to a1. - * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 - * data and Q1.15 coefficients are used as 24 bits as - * Q1.23 values. - */ - data2 = AE_SELP24_LL(d0, d1); - AE_MULAAFP24S_HH_LL(a0, data2, coef2); - data2 = AE_SELP24_HH(d0, d1); - AE_MULAAFP24S_HH_LL(a1, data2, coef2); - - /* Repeat for next two taps */ - AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); - AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */ - AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */ - data2 = AE_SELP24_LL(d0, d1); - AE_MULAAFP24S_HH_LL(a0, data2, coef2); - data2 = AE_SELP24_HH(d0, d1); - AE_MULAAFP24S_HH_LL(a1, data2, coef2); - } + sample10 = AE_SEL32_HH(f0, f1); + AE_MULAAFD32RA_HH_LL(a0, sample10, coef10); + sample10 = AE_SEL32_LL(f0, f1); + AE_MULAAFD32RA_HH_LL(a1, sample10, coef10); + + /* Repeat for next two taps */ + AE_L32X2_XC(f0, dp, inc); /* f0.h is left2, f0.l is right2 */ + AE_L32X2_XC(f1, dp, inc); /* f1.h is left3, f1.l is right3 */ + sample10 = AE_SEL32_HH(f0, f1); + AE_MULAAFD32RA_HH_LL(a0, sample10, coef32); + sample10 = AE_SEL32_LL(f0, f1); + AE_MULAAFD32RA_HH_LL(a1, sample10, coef32); + } - /* Scale FIR output with right shifts, round/saturate - * to Q1.31, and store 32 bit output. - */ - AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, - sizeof(int32_t)); - AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, - sizeof(int32_t)); + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. 
+ */ + f0 = AE_ROUND32X2F48SASYM(AE_SRAA64(a1, shift), AE_SRAA64(a0, shift)); + AE_S32X2_I(f0, (ae_int32x2 *)wp, 0); +} - return; - } +static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift, + const int nch) +{ + ae_f64 a0 = AE_ZERO64(); + ae_valign coef_align; + ae_f32x2 coef32; + ae_f32x2 coef10; + ae_f32x2 sample10; + ae_f32x2 f0; + ae_f32x2 f1; + ae_int32x2 *coefp; + ae_f32 *dp; + ae_f32 *dp1 = rp; + ae_f32 *wp = wp0; + const int inc = nch * sizeof(int32_t); + int i; + int j; - dp1 = (ae_f24 *)rp; for (j = 0; j < nch; j++) { /* Copy pointer and advance to next ch with dummy load */ - dp0 = dp1; - AE_L32F24_XC(data2, dp1, -sizeof(ae_f24)); + dp = dp1; + AE_L32_XC(f0, dp1, -sizeof(ae_f32)); /* Reset coefficient pointer and clear accumulator */ - coefp = (ae_f24x2 *)cp; + coefp = (ae_int32x2 *)cp; a0 = AE_ZERO64(); + coef_align = AE_LA64_PP(coefp); /* Compute FIR filter for current channel with four * taps per every loop iteration. Data is read from * interleaved buffer with stride of channels count. */ for (i = 0; i < taps_div_4; i++) { - /* Load two coefficients */ - coef2 = *coefp++; + /* Load four coefficients + * TODO: Ensure coefficients are 128 bits aligned + */ + AE_LA32X2_IP(coef10, coef_align, coefp); + AE_LA32X2_IP(coef32, coef_align, coefp); /* Load two data samples, place to high and * low of data2. */ - AE_L32F24_XC(d0, dp0, inc); - AE_L32F24_XC(d1, dp0, inc); - data2 = AE_SELP24_LL(d0, d1); + AE_L32_XC(f0, dp, inc); + AE_L32_XC(f1, dp, inc); + sample10 = AE_SEL32_LL(f0, f1); /* Accumulate to data2_h * coef2_h + * data2_l*coef2_l. The Q1.31 bit data is used * as Q1.23 from MSB side bits of the 32 bit * word. The accumulator m is Q17.47. 
*/ - AE_MULAAFD24_HH_LL(a0, data2, coef2); + AE_MULAAFD32RA_HH_LL(a0, sample10, coef10); /* Repeat the same for next two filter taps */ - coef2 = *coefp++; - AE_L32F24_XC(d0, dp0, inc); - AE_L32F24_XC(d1, dp0, inc); - data2 = AE_SELP24_LL(d0, d1); - AE_MULAAFD24_HH_LL(a0, data2, coef2); + AE_L32_XC(f0, dp, inc); + AE_L32_XC(f1, dp, inc); + sample10 = AE_SEL32_LL(f0, f1); + AE_MULAAFD32RA_HH_LL(a0, sample10, coef32); } /* Scale FIR output with right shifts, round/saturate Q17.47 * to Q1.31, and store 32 bit output. Advance write * pointer to next sample. */ - AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, - sizeof(int32_t)); + f0 = AE_ROUND32F48SASYM(AE_SRAA64(a0, shift)); + AE_S32_L_XP(f0, wp, sizeof(int32_t)); } } @@ -315,7 +339,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s) * 16x integers * 11x address pointers, */ - ae_int32x2 q = AE_ZERO32(); + ae_int32x2 q; ae_f32 *rp; ae_f32 *wp; int i; @@ -342,12 +366,7 @@ void src_polyphase_stage_cir(struct src_stage_prm *s) int32_t *y_wptr = (int32_t *)s->y_wptr; int32_t *x_end_addr = (int32_t *)s->x_end_addr; int32_t *y_end_addr = (int32_t *)s->y_end_addr; - -#if SRC_SHORT - const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); -#else - const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); -#endif + const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE; for (n = 0; n < s->times; n++) { /* Input data to filter */ @@ -388,17 +407,25 @@ void src_polyphase_stage_cir(struct src_stage_prm *s) * sub-filters. */ wp = (ae_f32 *)fir->out_rp; - for (i = 0; i < cfg->num_of_subfilters; i++) { - fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); - wp += nch_x_odm; - cp = (uint8_t *)cp + subfilter_size; - src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); - - /* Circular advance pointer rp by number of - * channels x input delay multiplier. Loaded value q - * is discarded. 
- */ - AE_L32_XC(q, rp, nch_x_idm_sz); + if (nch == 2) { + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift); + wp += nch_x_odm; + cp = (uint8_t *)cp + subfilter_size; + src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); + /* Circular advance pointer rp by number of channels x input + * delay multiplier. Loaded value q is discarded. + */ + AE_L32_XC(q, rp, nch_x_idm_sz); + } + } else { + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp = (uint8_t *)cp + subfilter_size; + src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); + AE_L32_XC(q, rp, nch_x_idm_sz); + } } /* Output */ @@ -464,12 +491,7 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s) int16_t *y_wptr = (int16_t *)s->y_wptr; int16_t *x_end_addr = (int16_t *)s->x_end_addr; int16_t *y_end_addr = (int16_t *)s->y_end_addr; - -#if SRC_SHORT - const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); -#else - const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); -#endif + const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE; for (n = 0; n < s->times; n++) { /* Input data */ @@ -509,17 +531,25 @@ void src_polyphase_stage_cir_s16(struct src_stage_prm *s) * sub-filters. */ wp = (ae_f32 *)fir->out_rp; - for (i = 0; i < cfg->num_of_subfilters; i++) { - fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); - wp += nch_x_odm; - cp = (uint8_t *)cp + subfilter_size; - src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); - - /* Circular advance pointer rp by number of - * channels x input delay multiplier. Loaded value q - * is discarded. 
- */ - AE_L32_XC(q, rp, nch_x_idm_sz); + if (nch == 2) { + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift); + wp += nch_x_odm; + cp = (uint8_t *)cp + subfilter_size; + src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); + /* Circular advance pointer rp by number of channels x input delay + * multiplier. Loaded value q is discarded. + */ + AE_L32_XC(q, rp, nch_x_idm_sz); + } + } else { + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp = (uint8_t *)cp + subfilter_size; + src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); + AE_L32_XC(q, rp, nch_x_idm_sz); + } } /* Output */ diff --git a/src/audio/src/src_hifi5.c b/src/audio/src/src_hifi5.c new file mode 100644 index 000000000000..0adfc4796d25 --- /dev/null +++ b/src/audio/src/src_hifi5.c @@ -0,0 +1,583 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2022-2025 Intel Corporation. +// +// Author: Seppo Ingalsuo +// Krzysztof Frydryk + +/* HiFi5 optimized code parts for SRC */ + +#include "src_config.h" + +#if SRC_HIFI5 + +#include "src_common.h" + +#include +#include +#include +#include +#include + +/* sof/math/numbers.h doesn't define MIN when used with zephyr */ +#ifdef __ZEPHYR__ +#include +#endif /* __ZEPHYR__ */ + +#if SRC_SHORT +#define SRC_COEF_SIZE sizeof(int16_t) +#else +#define SRC_COEF_SIZE sizeof(int32_t) +#endif + +#if SRC_SHORT /* 16 bit coefficients version */ + +static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift) +{ + /* This function uses + * 7x 64 bit registers + * 2x integers + * 3x address pointers, + */ + ae_f64 a0; + ae_f64 a1; + ae_valign u; + ae_f16x4 coef4; + ae_f32x2 d0; + ae_f32x2 d1; + ae_f32x2 data2; + ae_f16x4 *coefp; + ae_f32x2 *dp; + int i; + ae_f32 *wp = wp0; + const int inc = 2 * sizeof(int32_t); + + /* Move data pointer back by one sample to start from right + * channel 
sample. Discard read value d0. + */ + dp = (ae_f32x2 *)rp; + AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f16x4 *)cp; + a0 = AE_ZERO64(); + a1 = AE_ZERO64(); + u = AE_LA64_PP(coefp); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Four coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA16X4_IP(coef4, u, coefp); + + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r0, l0 */ + AE_L32X2_XC(d1, dp, inc); /* r1, l1 */ + + /* Select to data2 sequential samples from a channel + * and then accumulate to a0 and a1 + * data2_h * coef4_3 + data2_l * coef4_2. + * The data is 32 bits Q1.31 and coefficient 16 bits + * Q1.15. The accumulators are Q17.47. + */ + data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ + AE_MULAAFD32X16_H3_L2(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ + AE_MULAAFD32X16_H3_L2(a1, data2, coef4); + + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r2, l2 */ + AE_L32X2_XC(d1, dp, inc); /* r3, l3 */ + + /* Accumulate + * data2_h * coef4_1 + data2_l * coef4_0. + */ + data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ + AE_MULAAFD32X16_H1_L0(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ + AE_MULAAFD32X16_H1_L0(a1, data2, coef4); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output.
+ */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t)); +} + +static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift, + const int nch) +{ + /* This function uses + * 6x 64 bit registers + * 3x integers + * 3x address pointers, + */ + ae_f64 a0; + ae_valign u; + ae_f16x4 coef4; + ae_f32x2 d0; + ae_f32x2 d1; + ae_f32x2 data2; + ae_f16x4 *coefp; + ae_f32 *dp0; + ae_f32 *dp1; + int i; + int j; + ae_f32 *wp = wp0; + const int inc = nch * sizeof(int32_t); + + dp1 = (ae_f32 *)rp; + for (j = 0; j < nch; j++) { + /* Copy pointer and advance to next ch with dummy load */ + dp0 = dp1; + AE_L32_XC(d0, dp1, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f16x4 *)cp; + a0 = AE_ZERO64(); + u = AE_LA64_PP(coefp); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Data is read from + * interleaved buffer with stride of channels count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA16X4_IP(coef4, u, coefp); + + /* Load two data samples, place to high and + * low of data2. + */ + AE_L32_XC(d0, dp0, inc); + AE_L32_XC(d1, dp0, inc); + data2 = AE_SEL32_LL(d0, d1); + + /* Accumulate + * data2_h * coef4_3 + data2_l* coef4_2. + * The data is 32 bits Q1.31 and coefficient 16 bits + * Q1.15. The accumulator is Q17.47. + */ + AE_MULAAFD32X16_H3_L2(a0, data2, coef4); + + /* Repeat with next two samples */ + AE_L32_XC(d0, dp0, inc); + AE_L32_XC(d1, dp0, inc); + data2 = AE_SEL32_LL(d0, d1); + + /* Accumulate + * data2_h * coef4_1 + data2_l * coef4_0. + */ + AE_MULAAFD32X16_H1_L0(a0, data2, coef4); + } + + /* Scale FIR output with right shifts, round/saturate Q17.47 + * to Q1.31, and store 32 bit output. Advance write + * pointer to next sample. 
+ */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, + sizeof(int32_t)); + } +} + +#else /* 32bit coefficients version */ + +static inline void fir_filter_2ch(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift) +{ + ae_valignx2 coef_align; + ae_f64 a0 = AE_ZERO64(); + ae_f64 a1 = AE_ZERO64(); + ae_f32x2 coef32; + ae_f32x2 coef10; + ae_f32x2 left10; + ae_f32x2 right10; + ae_f32x2 f0; + ae_f32x2 f1; + ae_int32x4 *coefp; + ae_f32x2 *dp; + ae_f32 *wp = wp0; + const int inc = 2 * sizeof(int32_t); + int i; + + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value f0. + */ + dp = (ae_f32x2 *)rp; + AE_L32_XC(f0, (ae_f32 *)dp, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_int32x4 *)cp; + coef_align = AE_LA128_PP(coefp); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Four coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA32X2X2_IP(coef10, coef32, coef_align, coefp); + + /* Load two data samples from two channels. Note: Due to + * polyphase array data start shift for sub-filters can't + * use 128 bits load due to align requirement.
+		 */
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left0, f0.l is right0 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left1, f1.l is right1 */
+
+		/* a0 (left) += left10.h (left1) * coef10.h (coef2)
+		 * += left10.l (left0) * coef10.l (coef1)
+		 * a1 (right) += right10.h (right1) * coef10.h (coef2)
+		 * += right10.l (right0) * coef10.l (coef1)
+		 */
+		left10 = AE_SEL32_HH(f0, f1);
+		right10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAF2D32RA_HH_LL(a0, a1, left10, right10, coef10, coef10);
+
+		/* Repeat for next two taps */
+		AE_L32X2_XC(f0, dp, inc); /* f0.h is left2, f0.l is right2 */
+		AE_L32X2_XC(f1, dp, inc); /* f1.h is left3, f1.l is right3 */
+		left10 = AE_SEL32_HH(f0, f1);
+		right10 = AE_SEL32_LL(f0, f1);
+		AE_MULAAF2D32RA_HH_LL(a0, a1, left10, right10, coef32, coef32);
+	}
+
+	/* Scale FIR output with right shifts, round/saturate
+	 * to Q1.31, and store 32 bit output.
+	 */
+	f0 = AE_ROUND32X2F48SASYM(AE_SRAA64(a1, shift), AE_SRAA64(a0, shift));
+	AE_S32X2_I(f0, (ae_int32x2 *)wp, 0);
+}
+/* fir_filter() - compute one output sample per channel for one sub-filter.
+ *
+ * Generic channel count path: the callers in this file use it whenever
+ * nch is not 2.
+ *
+ * rp:         Read position in the interleaved FIR input delay line.
+ *             Samples are fetched with circular-addressing loads
+ *             (AE_L32_XC), so the circular buffer bounds must already be
+ *             set; the callers do that with AE_SETCBEGIN0() and
+ *             AE_SETCEND0() before calling.
+ * cp:         First coefficient of the sub-filter. Four 32 bit
+ *             coefficients are consumed per inner loop iteration.
+ * wp0:        Destination for nch consecutive 32 bit output samples.
+ * taps_div_4: Number of inner loop iterations, four taps each.
+ * shift:      Right shift applied to the accumulator before rounding.
+ * nch:        Number of interleaved channels; also the data read stride
+ *             in 32 bit words.
+ */
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+			      const int taps_div_4, const int shift,
+			      const int nch)
+{
+	ae_valignx2 coef_align;
+	ae_f64 a0 = AE_ZERO64();
+	ae_f32x2 coef32;
+	ae_f32x2 coef10;
+	ae_f32x2 sample10;
+	ae_f32x2 f0;
+	ae_f32x2 f1;
+	ae_int32x4 *coefp;
+	ae_f32 *dp;
+	ae_f32 *dp1 = rp;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t); /* Byte stride between samples of one channel */
+	int i;
+	int j;
+
+	for (j = 0; j < nch; j++) {
+		/* Take this channel's start position into dp, then step
+		 * dp1 to the start position of the next channel with a
+		 * dummy circular load. The stride is minus
+		 * sizeof(ae_f32) and the loaded f0 is discarded.
+		 */
+		dp = dp1;
+		AE_L32_XC(f0, dp1, -sizeof(ae_f32));
+
+		/* Reset coefficient pointer and clear accumulator. Every
+		 * channel is filtered with the same coefficients from cp.
+		 */
+		coefp = (ae_int32x4 *)cp;
+		a0 = AE_ZERO64();
+		coef_align = AE_LA128_PP(coefp);
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Data is read from
+		 * interleaved buffer with stride of channels count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load four coefficients into coef10 and coef32
+			 * through the alignment register coef_align.
+			 * TODO: Ensure coefficients are 128 bits aligned
+			 */
+			AE_LA32X2X2_IP(coef10, coef32, coef_align, coefp);
+
+			/* Load two data samples, place to high and
+			 * low of sample10.
+			 */
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
+
+			/* Accumulate sample10.h * coef10.h +
+			 * sample10.l * coef10.l into a0. The Q1.31 bit
+			 * data is used as Q1.23 from MSB side bits of
+			 * the 32 bit word. The accumulator a0 is Q17.47.
+			 * NOTE(review): the Q1.23 remark may be inherited
+			 * from a 24 bit multiplier variant; confirm it
+			 * against the AE_MULAAFD32RA_HH_LL definition.
+			 */
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef10);
+
+			/* Repeat the same for next two filter taps,
+			 * this time with the coefficient pair coef32.
+			 */
+			AE_L32_XC(f0, dp, inc);
+			AE_L32_XC(f1, dp, inc);
+			sample10 = AE_SEL32_LL(f0, f1);
+			AE_MULAAFD32RA_HH_LL(a0, sample10, coef32);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate Q17.47
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample, so the nch results end up in
+		 * consecutive words starting from wp0.
+		 */
+		f0 = AE_ROUND32F48SASYM(AE_SRAA64(a0, shift));
+		AE_S32_L_XP(f0, wp, sizeof(int32_t));
+	}
+}
+
+#endif /* 32bit coefficients version */
+
+#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for samples
+	 * stored in 32 bit words. Each block
+	 *  1. copies nch * cfg->blk_in words from the buffer at
+	 *     s->x_rptr into the FIR input delay line, left shifted
+	 *     by s->shift,
+	 *  2. runs cfg->num_of_subfilters sub-filters and writes their
+	 *     results into the output delay line,
+	 *  3. copies nch * cfg->num_of_subfilters words from the output
+	 *     delay line to the buffer at s->y_wptr, right shifted by
+	 *     s->shift.
+	 * The advanced positions are written back to s->x_rptr and
+	 * s->y_wptr before returning. fir->fir_wp and fir->out_rp are
+	 * passed to the post-updating store/load intrinsics directly.
+	 *
+	 * Resource note kept from the original author
+	 * (NOTE(review): not re-verified for this variant).
+	 * This function uses
+	 * 1x 64 bit registers
+	 * 16x integers
+	 * 11x address pointers
+	 */
+	ae_int32x2 q;
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	const struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch; /* In words, added to wp after each sub-filter */
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t); /* Negative stride: fir_wp steps backwards */
+	const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); /* Negative byte step for rp */
+	const int taps_div_4 = cfg->subfilter_length >> 2; /* Remainder of length / 4 is dropped */
+	int32_t *x_rptr = (int32_t *)s->x_rptr;
+	int32_t *y_wptr = (int32_t *)s->y_wptr;
+	int32_t *x_end_addr = (int32_t *)s->x_end_addr;
+	int32_t *y_end_addr = (int32_t *)s->y_end_addr;
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE; /* Bytes per sub-filter */
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data to filter */
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		for (m = blk_in_words; m > 0; m -= n_min) {
+			/* Number of words until the end of the input
+			 * buffer; copy at most that many in one run.
+			 */
+			n_wrap_buf = x_end_addr - x_rptr;
+			n_min = MIN(m, n_wrap_buf);
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bit sample to q, advance read
+				 * pointer, shift left by s->shift.
+				 * NOTE(review): the original comment said
+				 * "with saturate"; confirm AE_SLAA32
+				 * saturates.
+				 */
+				AE_L32_XP(q, (ae_int32 *)x_rptr, sz);
+				q = AE_SLAA32(q, s->shift);
+
+				/* Store to circular buffer, advance pointer */
+				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for wrap of the input read pointer */
+			src_inc_wrap(&x_rptr, x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Start the FIR output write pointer from the output
+		 * delay line read position and compute all polyphase
+		 * sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		if (nch == 2) {
+			/* Two channels are handled by fir_filter_2ch() */
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input
+				 * delay multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			/* Same sequence as above with the generic kernel */
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		for (m = blk_out_words; m > 0; m -= n_min) {
+			n_wrap_buf = y_end_addr - y_wptr; /* Words until the output buffer end */
+			n_min = MIN(m, n_wrap_buf);
+			for (i = 0; i < n_min; i++) {
+				/* Circular load, shift right, linear store,
+				 * and advance read and write pointers.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+				q = AE_SRAA32(q, s->shift);
+				AE_S32_L_XP(q, (ae_int32 *)y_wptr, sz);
+			}
+
+			/* Check wrap of the output write pointer */
+			src_inc_wrap(&y_wptr, y_end_addr, s->y_size);
+		}
+	}
+	s->x_rptr = x_rptr;
+	s->y_wptr = y_wptr;
+}
+#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */
+
+#if CONFIG_FORMAT_S16LE
+void src_polyphase_stage_cir_s16(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for 16 bit
+	 * input and output samples. Each block
+	 *  1. copies nch * cfg->blk_in 16 bit samples from the buffer
+	 *     at s->x_rptr into the 32 bit FIR input delay line,
+	 *  2. runs cfg->num_of_subfilters sub-filters and writes their
+	 *     results into the output delay line,
+	 *  3. rounds nch * cfg->num_of_subfilters 32 bit words from the
+	 *     output delay line to 16 bits and stores them to the
+	 *     buffer at s->y_wptr.
+	 * s->shift is not used in this variant. The advanced positions
+	 * are written back to s->x_rptr and s->y_wptr before returning.
+	 *
+	 * Resource note kept from the original author
+	 * (NOTE(review): not re-verified for this variant).
+	 * This function uses
+	 * 2x 64 bit registers
+	 * 16x integers
+	 * 11x address pointers
+	 */
+	ae_int32x2 q = AE_ZERO32();
+	ae_int16x4 d = AE_ZERO16();
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+
+	struct src_state *fir = s->state;
+	const struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch; /* In words, added to wp after each sub-filter */
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t); /* Negative stride: fir_wp steps backwards */
+	const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); /* Negative byte step for rp */
+	const int taps_div_4 = cfg->subfilter_length >> 2; /* Remainder of length / 4 is dropped */
+	int16_t *x_rptr = (int16_t *)s->x_rptr;
+	int16_t *y_wptr = (int16_t *)s->y_wptr;
+	int16_t *x_end_addr = (int16_t *)s->x_end_addr;
+	int16_t *y_end_addr = (int16_t *)s->y_end_addr;
+	const size_t subfilter_size = cfg->subfilter_length * SRC_COEF_SIZE; /* Bytes per sub-filter */
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+		for (m = blk_in_words; m > 0; m -= n_min) {
+			/* Number of 16 bit samples until the end of the
+			 * input buffer; copy at most that many in one run.
+			 */
+			n_wrap_buf = x_end_addr - x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			for (i = 0; i < n_min; i++) {
+				/* Load a 16 bits sample into d and left shift
+				 * by 16 into q, advance read and write
+				 * pointers.
+				 */
+				AE_L16_XP(d, (ae_int16 *)x_rptr,
+					  sizeof(ae_int16));
+				q = AE_CVT32X2F16_32(d);
+				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for wrap of the input read pointer */
+			src_inc_wrap_s16(&x_rptr, x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Start the FIR output write pointer from the output
+		 * delay line read position and compute all polyphase
+		 * sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		if (nch == 2) {
+			/* Two channels are handled by fir_filter_2ch() */
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter_2ch(rp, cp, wp, taps_div_4, cfg->shift);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				/* Circular advance pointer rp by number of channels x input delay
+				 * multiplier. Loaded value q is discarded.
+				 */
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		} else {
+			/* Same sequence as above with the generic kernel */
+			for (i = 0; i < cfg->num_of_subfilters; i++) {
+				fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+				wp += nch_x_odm;
+				cp = (uint8_t *)cp + subfilter_size;
+				src_inc_wrap((int32_t **)&wp, out_delay_end, out_size);
+				AE_L32_XC(q, rp, nch_x_idm_sz);
+			}
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		for (m = blk_out_words; m > 0; m -= n_min) {
+			n_wrap_buf = y_end_addr - y_wptr; /* Samples until the output buffer end */
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load for 32 bit sample,
+				 * advance read pointer.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+
+				/* Store Q1.31 value as Q1.15 and
+				 * advance write pointer.
+				 */
+				d = AE_ROUND16X4F32SSYM(q, q);
+				AE_S16_0_XP(d, (ae_int16 *)y_wptr,
+					    sizeof(ae_int16));
+			}
+
+			/* Check wrap of the output write pointer */
+			src_inc_wrap_s16(&y_wptr, y_end_addr, s->y_size);
+		}
+	}
+	s->x_rptr = x_rptr;
+	s->y_wptr = y_wptr;
+}
+#endif /* CONFIG_FORMAT_S16LE */
+
+#endif
diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt
index 4c2ffbc3db4d..77050ac54c20 100644
--- a/zephyr/CMakeLists.txt
+++ b/zephyr/CMakeLists.txt
@@ -867,6 +867,7 @@ elseif(CONFIG_COMP_SRC)
 		${SOF_AUDIO_PATH}/src/src_generic.c
 		${SOF_AUDIO_PATH}/src/src_hifi3.c
 		${SOF_AUDIO_PATH}/src/src_hifi4.c
+		${SOF_AUDIO_PATH}/src/src_hifi5.c
 		${SOF_AUDIO_PATH}/src/src_common.c
 		${SOF_AUDIO_PATH}/src/src.c
 		${SOF_AUDIO_PATH}/src/src_${ipc_suffix}.c