From cdb7dd0aa0c3322689a7234fa1187c616b21c90f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au> Date: Thu, 19 Mar 2026 16:18:04 +1100 Subject: [PATCH] Add native wrapper for libmvec.so.1 (_ZGVbN2v_sin, _ZGVbN2v_cos) Wrap the SSE2 vectorized sin/cos functions from glibc libmvec. PRE_INIT first tries to dlopen the native libmvec.so.1 and resolve the host's vector sin/cos (_ZGVnN2v_* on aarch64, _ZGVbN2v_* elsewhere); when they are found, the wrappers call them directly. Where libmvec.so.1 does not exist as a native library (e.g. ppc64le), PRE_INIT uses dlopen(NULL) to obtain a valid library handle and the wrappers fall back to native scalar sin()/cos() from libm. All other libmvec symbols (214 entries) are listed as commented-out entries in the private header and fall back to emulation. --- CMakeLists.txt | 1 + src/library_list.h | 1 + src/wrapped/generated/functions_list.txt | 4 + src/wrapped/generated/wrappedlibmvecdefs.h | 8 + src/wrapped/generated/wrappedlibmvectypes.h | 20 ++ src/wrapped/generated/wrappedlibmvecundefs.h | 8 + src/wrapped/wrappedlibmvec.c | 86 +++++++ src/wrapped/wrappedlibmvec_private.h | 235 +++++++++++++++++++ 8 files changed, 363 insertions(+) create mode 100644 src/wrapped/generated/wrappedlibmvecdefs.h create mode 100644 src/wrapped/generated/wrappedlibmvectypes.h create mode 100644 src/wrapped/generated/wrappedlibmvecundefs.h create mode 100644 src/wrapped/wrappedlibmvec.c create mode 100644 src/wrapped/wrappedlibmvec_private.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8965687ca9..590db48443 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -668,6 +668,7 @@ set(WRAPPEDS "${BOX64_ROOT}/src/wrapped/wrappedlibice.c" "${BOX64_ROOT}/src/wrapped/wrappedliblapack.c" "${BOX64_ROOT}/src/wrapped/wrappedlibm.c" + "${BOX64_ROOT}/src/wrapped/wrappedlibmvec.c" "${BOX64_ROOT}/src/wrapped/wrappedlibncurses.c" "${BOX64_ROOT}/src/wrapped/wrappedlibncurses6.c" "${BOX64_ROOT}/src/wrapped/wrappedlibncursesw.c" diff --git a/src/library_list.h b/src/library_list.h index 80d7795101..5ffce98736 100644 --- a/src/library_list.h +++ b/src/library_list.h @@ -406,6 +406,7 @@ GO("libandroid-shmem.so", androidshmem) GO("libc.so.6", libc) GO("libm.so.6", libm) 
+GO("libmvec.so.1", libmvec) GO("libdl.so.2", libdl) GO("libpthread.so.0", libpthread) GO("libcups.so.2", libcups) diff --git a/src/wrapped/generated/functions_list.txt b/src/wrapped/generated/functions_list.txt index 4edaa8aa51..3c7526a7e2 100644 --- a/src/wrapped/generated/functions_list.txt +++ b/src/wrapped/generated/functions_list.txt @@ -5992,6 +5992,10 @@ wrappedlibm: - __pow_finite - DFDD: - __powl_finite +wrappedlibmvec: +- vFv: + - _ZGVbN2v_cos + - _ZGVbN2v_sin wrappedlibncurses: - pFv: - initscr diff --git a/src/wrapped/generated/wrappedlibmvecdefs.h b/src/wrapped/generated/wrappedlibmvecdefs.h new file mode 100644 index 0000000000..6bfaa1f0fb --- /dev/null +++ b/src/wrapped/generated/wrappedlibmvecdefs.h @@ -0,0 +1,8 @@ +/******************************************************************* + * File automatically generated by rebuild_wrappers.py (v2.5.0.24) * + *******************************************************************/ +#ifndef __wrappedlibmvecDEFS_H_ +#define __wrappedlibmvecDEFS_H_ + + +#endif // __wrappedlibmvecDEFS_H_ diff --git a/src/wrapped/generated/wrappedlibmvectypes.h b/src/wrapped/generated/wrappedlibmvectypes.h new file mode 100644 index 0000000000..45d7e546ca --- /dev/null +++ b/src/wrapped/generated/wrappedlibmvectypes.h @@ -0,0 +1,20 @@ +/******************************************************************* + * File automatically generated by rebuild_wrappers.py (v2.5.0.24) * + *******************************************************************/ +#ifndef __wrappedlibmvecTYPES_H_ +#define __wrappedlibmvecTYPES_H_ + +#ifndef LIBNAME +#error You should only #include this file inside a wrapped*.c file +#endif +#ifndef ADDED_FUNCTIONS +#define ADDED_FUNCTIONS() +#endif + +typedef void (*vFv_t)(void); + +#define SUPER() ADDED_FUNCTIONS() \ + GO(_ZGVbN2v_cos, vFv_t) \ + GO(_ZGVbN2v_sin, vFv_t) + +#endif // __wrappedlibmvecTYPES_H_ diff --git a/src/wrapped/generated/wrappedlibmvecundefs.h b/src/wrapped/generated/wrappedlibmvecundefs.h new 
file mode 100644 index 0000000000..e79e3c7e39 --- /dev/null +++ b/src/wrapped/generated/wrappedlibmvecundefs.h @@ -0,0 +1,8 @@ +/******************************************************************* + * File automatically generated by rebuild_wrappers.py (v2.5.0.24) * + *******************************************************************/ +#ifndef __wrappedlibmvecUNDEFS_H_ +#define __wrappedlibmvecUNDEFS_H_ + + +#endif // __wrappedlibmvecUNDEFS_H_ diff --git a/src/wrapped/wrappedlibmvec.c b/src/wrapped/wrappedlibmvec.c new file mode 100644 index 0000000000..ff01626321 --- /dev/null +++ b/src/wrapped/wrappedlibmvec.c @@ -0,0 +1,86 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <dlfcn.h> +#include <math.h> + +#include "wrappedlibs.h" + +#include "wrapper.h" +#include "bridge.h" +#include "librarian/library_private.h" +#include "x64emu.h" +#include "debug.h" +#include "emu/x64emu_private.h" + +const char* libmvecName = "libmvec.so.1"; +#define LIBNAME libmvec + +// GCC vector type: 2 packed doubles in 128 bits. +// Matches x86 SSE2 __m128d and aarch64 NEON float64x2_t layout. +typedef double v2df __attribute__((vector_size(16))); +typedef v2df (*v2df_func_t)(v2df); /* host routine taking and returning one 2-double vector */ + +static v2df_func_t native_v2d_sin = NULL; /* host vector sin from PRE_INIT; NULL selects the scalar fallback */ +static v2df_func_t native_v2d_cos = NULL; /* host vector cos from PRE_INIT; NULL selects the scalar fallback */ + +// Native libmvec symbol names per architecture. 
+// aarch64 NEON: _ZGVnN2v_* (n=NEON, N=no mask, 2=two doubles, v=vector arg) +// x86_64 SSE2: _ZGVbN2v_* (b=SSE, N=no mask, 2=two doubles, v=vector arg) +#ifdef ARM64 +#define NATIVE_SIN_NAME "_ZGVnN2v_sin" +#define NATIVE_COS_NAME "_ZGVnN2v_cos" +#else /* every non-ARM64 build looks up the x86-style names */ +#define NATIVE_SIN_NAME "_ZGVbN2v_sin" +#define NATIVE_COS_NAME "_ZGVbN2v_cos" +#endif + +// _ZGVbN2v_sin: SSE2 vectorized sin(__m128d) -> __m128d +// Input: xmm0 = {double, double} (2 packed doubles) +// Output: xmm0 = {sin(d0), sin(d1)} +EXPORT void my__ZGVbN2v_sin(x64emu_t* emu) +{ + if (native_v2d_sin) { /* set by PRE_INIT when dlsym found the host vector sin */ + v2df input, result; + memcpy(&input, &emu->xmm[0], sizeof(v2df)); /* emulated xmm0 -> host vector value */ + result = native_v2d_sin(input); + memcpy(&emu->xmm[0], &result, sizeof(v2df)); /* result is written back to emulated xmm0 */ + } else { /* no host vector sin: apply scalar sin() to each lane in place */ + emu->xmm[0].d[0] = sin(emu->xmm[0].d[0]); + emu->xmm[0].d[1] = sin(emu->xmm[0].d[1]); + } +} + +// _ZGVbN2v_cos: SSE2 vectorized cos(__m128d) -> __m128d +// Input: xmm0 = {double, double} (2 packed doubles) +// Output: xmm0 = {cos(d0), cos(d1)} +EXPORT void my__ZGVbN2v_cos(x64emu_t* emu) +{ + if (native_v2d_cos) { /* set by PRE_INIT when dlsym found the host vector cos */ + v2df input, result; + memcpy(&input, &emu->xmm[0], sizeof(v2df)); /* emulated xmm0 -> host vector value */ + result = native_v2d_cos(input); + memcpy(&emu->xmm[0], &result, sizeof(v2df)); /* result is written back to emulated xmm0 */ + } else { /* no host vector cos: apply scalar cos() to each lane in place */ + emu->xmm[0].d[0] = cos(emu->xmm[0].d[0]); + emu->xmm[0].d[1] = cos(emu->xmm[0].d[1]); + } +} + +// Try to load the native libmvec.so.1 and resolve vectorized sin/cos. +// On aarch64 and x86_64: native libmvec exists in glibc, use it. +// On ppc64le and others: dlopen fails, fall back to scalar sin/cos. 
+#define PRE_INIT \ + if (1) { \ + void* native = dlopen("libmvec.so.1", RTLD_LAZY); /* host libmvec, if one is installed */ \ + if (native) { /* a dlsym below may still yield NULL; that wrapper then uses scalar libm */ \ + lib->w.lib = native; \ + native_v2d_sin = (v2df_func_t)dlsym(native, NATIVE_SIN_NAME); \ + native_v2d_cos = (v2df_func_t)dlsym(native, NATIVE_COS_NAME); \ + } else { /* no host libmvec: take the main program's handle so lib->w.lib is still set */ \ + lib->w.lib = dlopen(NULL, RTLD_LAZY | RTLD_GLOBAL); \ + } \ + } else /* NOTE(review): dangling else presumably skips the default open in wrappedlib_init.h - confirm */ + +#include "wrappedlib_init.h" diff --git a/src/wrapped/wrappedlibmvec_private.h b/src/wrapped/wrappedlibmvec_private.h new file mode 100644 index 0000000000..878f248b8a --- /dev/null +++ b/src/wrapped/wrappedlibmvec_private.h @@ -0,0 +1,235 @@ +#if !(defined(GO) && defined(GOM) && defined(GO2) && defined(DATA)) +#error meh! +#endif + +// SSE2 2x double (b = SSE, N2 = 2 elements, v = vector) +//GO(_ZGVbN2v_acos, vFv) +//GO(_ZGVbN2v_acosh, vFv) +//GO(_ZGVbN2v_asin, vFv) +//GO(_ZGVbN2v_asinh, vFv) +//GO(_ZGVbN2v_atan, vFv) +//GO(_ZGVbN2v_atanh, vFv) +//GO(_ZGVbN2v_cbrt, vFv) +GOM(_ZGVbN2v_cos, vFEv) +//GO(_ZGVbN2v_cosh, vFv) +//GO(_ZGVbN2v_erf, vFv) +//GO(_ZGVbN2v_erfc, vFv) +//GO(_ZGVbN2v_exp, vFv) +//GO(_ZGVbN2v_exp10, vFv) +//GO(_ZGVbN2v_exp2, vFv) +//GO(_ZGVbN2v_expm1, vFv) +//GO(_ZGVbN2v_log, vFv) +//GO(_ZGVbN2v_log10, vFv) +//GO(_ZGVbN2v_log1p, vFv) +//GO(_ZGVbN2v_log2, vFv) +GOM(_ZGVbN2v_sin, vFEv) +//GO(_ZGVbN2v_sinh, vFv) +//GO(_ZGVbN2v_tan, vFv) +//GO(_ZGVbN2v_tanh, vFv) +//GO(_ZGVbN2vv_atan2, vFv) +//GO(_ZGVbN2vv_hypot, vFv) +//GO(_ZGVbN2vv_pow, vFv) +//GO(_ZGVbN2vvv_sincos, vFv) + +// SSE2 4x float (b = SSE, N4 = 4 elements, v = vector) +//GO(_ZGVbN4v_acosf, vFv) +//GO(_ZGVbN4v_acoshf, vFv) +//GO(_ZGVbN4v_asinf, vFv) +//GO(_ZGVbN4v_asinhf, vFv) +//GO(_ZGVbN4v_atanf, vFv) +//GO(_ZGVbN4v_atanhf, vFv) +//GO(_ZGVbN4v_cbrtf, vFv) +//GO(_ZGVbN4v_cosf, vFv) +//GO(_ZGVbN4v_coshf, vFv) +//GO(_ZGVbN4v_erfcf, vFv) +//GO(_ZGVbN4v_erff, vFv) +//GO(_ZGVbN4v_exp10f, vFv) +//GO(_ZGVbN4v_exp2f, vFv) +//GO(_ZGVbN4v_expf, vFv) +//GO(_ZGVbN4v_expm1f, vFv) +//GO(_ZGVbN4v_log10f, vFv) +//GO(_ZGVbN4v_log1pf, vFv) +//GO(_ZGVbN4v_log2f, 
vFv) +//GO(_ZGVbN4v_logf, vFv) +//GO(_ZGVbN4v_sinf, vFv) +//GO(_ZGVbN4v_sinhf, vFv) +//GO(_ZGVbN4v_tanf, vFv) +//GO(_ZGVbN4v_tanhf, vFv) +//GO(_ZGVbN4vv_atan2f, vFv) +//GO(_ZGVbN4vv_hypotf, vFv) +//GO(_ZGVbN4vv_powf, vFv) +//GO(_ZGVbN4vvv_sincosf, vFv) + +// AVX 4x double (c = AVX, N4 = 4 elements, v = vector) +//GO(_ZGVcN4v_acos, vFv) +//GO(_ZGVcN4v_acosh, vFv) +//GO(_ZGVcN4v_asin, vFv) +//GO(_ZGVcN4v_asinh, vFv) +//GO(_ZGVcN4v_atan, vFv) +//GO(_ZGVcN4v_atanh, vFv) +//GO(_ZGVcN4v_cbrt, vFv) +//GO(_ZGVcN4v_cos, vFv) +//GO(_ZGVcN4v_cosh, vFv) +//GO(_ZGVcN4v_erf, vFv) +//GO(_ZGVcN4v_erfc, vFv) +//GO(_ZGVcN4v_exp, vFv) +//GO(_ZGVcN4v_exp10, vFv) +//GO(_ZGVcN4v_exp2, vFv) +//GO(_ZGVcN4v_expm1, vFv) +//GO(_ZGVcN4v_log, vFv) +//GO(_ZGVcN4v_log10, vFv) +//GO(_ZGVcN4v_log1p, vFv) +//GO(_ZGVcN4v_log2, vFv) +//GO(_ZGVcN4v_sin, vFv) +//GO(_ZGVcN4v_sinh, vFv) +//GO(_ZGVcN4v_tan, vFv) +//GO(_ZGVcN4v_tanh, vFv) +//GO(_ZGVcN4vv_atan2, vFv) +//GO(_ZGVcN4vv_hypot, vFv) +//GO(_ZGVcN4vv_pow, vFv) +//GO(_ZGVcN4vvv_sincos, vFv) + +// AVX 8x float (c = AVX, N8 = 8 elements, v = vector) +//GO(_ZGVcN8v_acosf, vFv) +//GO(_ZGVcN8v_acoshf, vFv) +//GO(_ZGVcN8v_asinf, vFv) +//GO(_ZGVcN8v_asinhf, vFv) +//GO(_ZGVcN8v_atanf, vFv) +//GO(_ZGVcN8v_atanhf, vFv) +//GO(_ZGVcN8v_cbrtf, vFv) +//GO(_ZGVcN8v_cosf, vFv) +//GO(_ZGVcN8v_coshf, vFv) +//GO(_ZGVcN8v_erfcf, vFv) +//GO(_ZGVcN8v_erff, vFv) +//GO(_ZGVcN8v_exp10f, vFv) +//GO(_ZGVcN8v_exp2f, vFv) +//GO(_ZGVcN8v_expf, vFv) +//GO(_ZGVcN8v_expm1f, vFv) +//GO(_ZGVcN8v_log10f, vFv) +//GO(_ZGVcN8v_log1pf, vFv) +//GO(_ZGVcN8v_log2f, vFv) +//GO(_ZGVcN8v_logf, vFv) +//GO(_ZGVcN8v_sinf, vFv) +//GO(_ZGVcN8v_sinhf, vFv) +//GO(_ZGVcN8v_tanf, vFv) +//GO(_ZGVcN8v_tanhf, vFv) +//GO(_ZGVcN8vv_atan2f, vFv) +//GO(_ZGVcN8vv_hypotf, vFv) +//GO(_ZGVcN8vv_powf, vFv) +//GO(_ZGVcN8vvv_sincosf, vFv) + +// AVX2 4x double (d = AVX2, N4 = 4 elements, v = vector) +//GO(_ZGVdN4v_acos, vFv) +//GO(_ZGVdN4v_acosh, vFv) +//GO(_ZGVdN4v_asin, vFv) +//GO(_ZGVdN4v_asinh, vFv) 
+//GO(_ZGVdN4v_atan, vFv) +//GO(_ZGVdN4v_atanh, vFv) +//GO(_ZGVdN4v_cbrt, vFv) +//GO(_ZGVdN4v_cos, vFv) +//GO(_ZGVdN4v_cosh, vFv) +//GO(_ZGVdN4v_erf, vFv) +//GO(_ZGVdN4v_erfc, vFv) +//GO(_ZGVdN4v_exp, vFv) +//GO(_ZGVdN4v_exp10, vFv) +//GO(_ZGVdN4v_exp2, vFv) +//GO(_ZGVdN4v_expm1, vFv) +//GO(_ZGVdN4v_log, vFv) +//GO(_ZGVdN4v_log10, vFv) +//GO(_ZGVdN4v_log1p, vFv) +//GO(_ZGVdN4v_log2, vFv) +//GO(_ZGVdN4v_sin, vFv) +//GO(_ZGVdN4v_sinh, vFv) +//GO(_ZGVdN4v_tan, vFv) +//GO(_ZGVdN4v_tanh, vFv) +//GO(_ZGVdN4vv_atan2, vFv) +//GO(_ZGVdN4vv_hypot, vFv) +//GO(_ZGVdN4vv_pow, vFv) +//GO(_ZGVdN4vvv_sincos, vFv) + +// AVX2 8x float (d = AVX2, N8 = 8 elements, v = vector) +//GO(_ZGVdN8v_acosf, vFv) +//GO(_ZGVdN8v_acoshf, vFv) +//GO(_ZGVdN8v_asinf, vFv) +//GO(_ZGVdN8v_asinhf, vFv) +//GO(_ZGVdN8v_atanf, vFv) +//GO(_ZGVdN8v_atanhf, vFv) +//GO(_ZGVdN8v_cbrtf, vFv) +//GO(_ZGVdN8v_cosf, vFv) +//GO(_ZGVdN8v_coshf, vFv) +//GO(_ZGVdN8v_erfcf, vFv) +//GO(_ZGVdN8v_erff, vFv) +//GO(_ZGVdN8v_exp10f, vFv) +//GO(_ZGVdN8v_exp2f, vFv) +//GO(_ZGVdN8v_expf, vFv) +//GO(_ZGVdN8v_expm1f, vFv) +//GO(_ZGVdN8v_log10f, vFv) +//GO(_ZGVdN8v_log1pf, vFv) +//GO(_ZGVdN8v_log2f, vFv) +//GO(_ZGVdN8v_logf, vFv) +//GO(_ZGVdN8v_sinf, vFv) +//GO(_ZGVdN8v_sinhf, vFv) +//GO(_ZGVdN8v_tanf, vFv) +//GO(_ZGVdN8v_tanhf, vFv) +//GO(_ZGVdN8vv_atan2f, vFv) +//GO(_ZGVdN8vv_hypotf, vFv) +//GO(_ZGVdN8vv_powf, vFv) +//GO(_ZGVdN8vvv_sincosf, vFv) + +// AVX-512 8x double (e = AVX-512, N8 = 8 elements, v = vector) +//GO(_ZGVeN8v_acos, vFv) +//GO(_ZGVeN8v_acosh, vFv) +//GO(_ZGVeN8v_asin, vFv) +//GO(_ZGVeN8v_asinh, vFv) +//GO(_ZGVeN8v_atan, vFv) +//GO(_ZGVeN8v_atanh, vFv) +//GO(_ZGVeN8v_cbrt, vFv) +//GO(_ZGVeN8v_cos, vFv) +//GO(_ZGVeN8v_cosh, vFv) +//GO(_ZGVeN8v_erf, vFv) +//GO(_ZGVeN8v_erfc, vFv) +//GO(_ZGVeN8v_exp, vFv) +//GO(_ZGVeN8v_exp10, vFv) +//GO(_ZGVeN8v_exp2, vFv) +//GO(_ZGVeN8v_expm1, vFv) +//GO(_ZGVeN8v_log, vFv) +//GO(_ZGVeN8v_log10, vFv) +//GO(_ZGVeN8v_log1p, vFv) +//GO(_ZGVeN8v_log2, vFv) +//GO(_ZGVeN8v_sin, vFv) 
+//GO(_ZGVeN8v_sinh, vFv) +//GO(_ZGVeN8v_tan, vFv) +//GO(_ZGVeN8v_tanh, vFv) +//GO(_ZGVeN8vv_atan2, vFv) +//GO(_ZGVeN8vv_hypot, vFv) +//GO(_ZGVeN8vv_pow, vFv) +//GO(_ZGVeN8vvv_sincos, vFv) + +// AVX-512 16x float (e = AVX-512, N16 = 16 elements, v = vector) +//GO(_ZGVeN16v_acosf, vFv) +//GO(_ZGVeN16v_acoshf, vFv) +//GO(_ZGVeN16v_asinf, vFv) +//GO(_ZGVeN16v_asinhf, vFv) +//GO(_ZGVeN16v_atanf, vFv) +//GO(_ZGVeN16v_atanhf, vFv) +//GO(_ZGVeN16v_cbrtf, vFv) +//GO(_ZGVeN16v_cosf, vFv) +//GO(_ZGVeN16v_coshf, vFv) +//GO(_ZGVeN16v_erfcf, vFv) +//GO(_ZGVeN16v_erff, vFv) +//GO(_ZGVeN16v_exp10f, vFv) +//GO(_ZGVeN16v_exp2f, vFv) +//GO(_ZGVeN16v_expf, vFv) +//GO(_ZGVeN16v_expm1f, vFv) +//GO(_ZGVeN16v_log10f, vFv) +//GO(_ZGVeN16v_log1pf, vFv) +//GO(_ZGVeN16v_log2f, vFv) +//GO(_ZGVeN16v_logf, vFv) +//GO(_ZGVeN16v_sinf, vFv) +//GO(_ZGVeN16v_sinhf, vFv) +//GO(_ZGVeN16v_tanf, vFv) +//GO(_ZGVeN16v_tanhf, vFv) +//GO(_ZGVeN16vv_atan2f, vFv) +//GO(_ZGVeN16vv_hypotf, vFv) +//GO(_ZGVeN16vv_powf, vFv) +//GO(_ZGVeN16vvv_sincosf, vFv)