Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,7 @@ set(WRAPPEDS
"${BOX64_ROOT}/src/wrapped/wrappedlibice.c"
"${BOX64_ROOT}/src/wrapped/wrappedliblapack.c"
"${BOX64_ROOT}/src/wrapped/wrappedlibm.c"
"${BOX64_ROOT}/src/wrapped/wrappedlibmvec.c"
"${BOX64_ROOT}/src/wrapped/wrappedlibncurses.c"
"${BOX64_ROOT}/src/wrapped/wrappedlibncurses6.c"
"${BOX64_ROOT}/src/wrapped/wrappedlibncursesw.c"
Expand Down
1 change: 1 addition & 0 deletions src/library_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ GO("libandroid-shmem.so", androidshmem)

GO("libc.so.6", libc)
GO("libm.so.6", libm)
GO("libmvec.so.1", libmvec)
GO("libdl.so.2", libdl)
GO("libpthread.so.0", libpthread)
GO("libcups.so.2", libcups)
Expand Down
4 changes: 4 additions & 0 deletions src/wrapped/generated/functions_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5992,6 +5992,10 @@ wrappedlibm:
- __pow_finite
- DFDD:
- __powl_finite
wrappedlibmvec:
- vFv:
- _ZGVbN2v_cos
- _ZGVbN2v_sin
wrappedlibncurses:
- pFv:
- initscr
Expand Down
8 changes: 8 additions & 0 deletions src/wrapped/generated/wrappedlibmvecdefs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/*******************************************************************
* File automatically generated by rebuild_wrappers.py (v2.5.0.24) *
*******************************************************************/
#ifndef __wrappedlibmvecDEFS_H_
#define __wrappedlibmvecDEFS_H_


#endif // __wrappedlibmvecDEFS_H_
20 changes: 20 additions & 0 deletions src/wrapped/generated/wrappedlibmvectypes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*******************************************************************
* File automatically generated by rebuild_wrappers.py (v2.5.0.24) *
*******************************************************************/
#ifndef __wrappedlibmvecTYPES_H_
#define __wrappedlibmvecTYPES_H_

#ifndef LIBNAME
#error You should only #include this file inside a wrapped*.c file
#endif
#ifndef ADDED_FUNCTIONS
#define ADDED_FUNCTIONS()
#endif

typedef void (*vFv_t)(void);

#define SUPER() ADDED_FUNCTIONS() \
GO(_ZGVbN2v_cos, vFv_t) \
GO(_ZGVbN2v_sin, vFv_t)

#endif // __wrappedlibmvecTYPES_H_
8 changes: 8 additions & 0 deletions src/wrapped/generated/wrappedlibmvecundefs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/*******************************************************************
* File automatically generated by rebuild_wrappers.py (v2.5.0.24) *
*******************************************************************/
#ifndef __wrappedlibmvecUNDEFS_H_
#define __wrappedlibmvecUNDEFS_H_


#endif // __wrappedlibmvecUNDEFS_H_
86 changes: 86 additions & 0 deletions src/wrapped/wrappedlibmvec.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dlfcn.h>
#include <math.h>

#include "wrappedlibs.h"

#include "wrapper.h"
#include "bridge.h"
#include "librarian/library_private.h"
#include "x64emu.h"
#include "debug.h"
#include "emu/x64emu_private.h"

const char* libmvecName = "libmvec.so.1";
#define LIBNAME libmvec

// 128-bit vector of two doubles, declared with the GCC vector extension.
// Same in-memory layout as x86 SSE2 __m128d and aarch64 NEON float64x2_t.
typedef double v2df __attribute__((vector_size(16)));

// Shape of a host libmvec routine: one v2df in, one v2df out.
typedef v2df (*v2df_func_t)(v2df);

// Host entry points for the vectorized cos/sin. Both start out NULL and are
// only filled in if the host libmvec can be loaded and exports the symbol.
static v2df_func_t native_v2d_cos = NULL;
static v2df_func_t native_v2d_sin = NULL;

// Symbol names to look up in the host libmvec. The vector-ABI prefix names
// the ISA ("b" = x86_64 SSE, "n" = aarch64 NEON); "N2v" reads as
// N = unmasked, 2 = two lanes, v = one vector argument.
#ifndef ARM64
#define NATIVE_COS_NAME "_ZGVbN2v_cos"
#define NATIVE_SIN_NAME "_ZGVbN2v_sin"
#else
#define NATIVE_COS_NAME "_ZGVnN2v_cos"
#define NATIVE_SIN_NAME "_ZGVnN2v_sin"
#endif

// Wrapper for _ZGVbN2v_sin, the SSE2 two-lane vectorized sin().
//   in : emulated xmm0 = { d0, d1 } (two packed doubles)
//   out: emulated xmm0 = { sin(d0), sin(d1) }
// Uses the host vector routine when one was resolved, otherwise computes each
// lane with scalar sin() from libm.
EXPORT void my__ZGVbN2v_sin(x64emu_t* emu)
{
    if (!native_v2d_sin) {
        // No host vector implementation available: handle the lanes one by one.
        emu->xmm[0].d[0] = sin(emu->xmm[0].d[0]);
        emu->xmm[0].d[1] = sin(emu->xmm[0].d[1]);
        return;
    }

    // Copy the register contents through a host vector value with memcpy
    // rather than casting the emulator's register storage to v2df.
    v2df lanes;
    memcpy(&lanes, &emu->xmm[0], sizeof(lanes));
    lanes = native_v2d_sin(lanes);
    memcpy(&emu->xmm[0], &lanes, sizeof(lanes));
}

// Wrapper for _ZGVbN2v_cos, the SSE2 two-lane vectorized cos().
//   in : emulated xmm0 = { d0, d1 } (two packed doubles)
//   out: emulated xmm0 = { cos(d0), cos(d1) }
// Uses the host vector routine when one was resolved, otherwise computes each
// lane with scalar cos() from libm.
EXPORT void my__ZGVbN2v_cos(x64emu_t* emu)
{
    if (!native_v2d_cos) {
        // No host vector implementation available: handle the lanes one by one.
        emu->xmm[0].d[0] = cos(emu->xmm[0].d[0]);
        emu->xmm[0].d[1] = cos(emu->xmm[0].d[1]);
        return;
    }

    // Copy the register contents through a host vector value with memcpy
    // rather than casting the emulator's register storage to v2df.
    v2df lanes;
    memcpy(&lanes, &emu->xmm[0], sizeof(lanes));
    lanes = native_v2d_cos(lanes);
    memcpy(&emu->xmm[0], &lanes, sizeof(lanes));
}

// Try to load the native libmvec.so.1 and resolve vectorized sin/cos.
// On aarch64 and x86_64: native libmvec exists in glibc, use it.
// On ppc64le and others: dlopen fails, fall back to scalar sin/cos.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hopefully https://sourceware.org/pipermail/glibc-cvs/2020q1/068984.html will be upstreamed for ppc64 one day.

// PRE_INIT: library-load hook, presumably expanded by wrappedlib_init.h
// (included right after this macro) — confirm against that header.
//
// The `if (1) { ... } else` shape deliberately ends on a dangling `else`, so
// the statement that follows the expansion is never executed; this block is
// therefore the only place lib->w.lib gets set for this wrapper.
//
// What the block does:
//  - dlopen() the host "libmvec.so.1". On success its handle is stored in
//    lib->w.lib and the two vector entry points are looked up with dlsym().
//    A failed lookup leaves that pointer NULL, and the matching my_* wrapper
//    then uses its scalar sin()/cos() path.
//  - If the host has no libmvec, lib->w.lib is set to dlopen(NULL, ...), i.e.
//    a handle for the running program, and both native pointers stay NULL.
//
// Keep comments out of the macro body: a `//` comment on a line ending in a
// backslash would swallow the continuation lines after it.
#define PRE_INIT \
if (1) { \
void* native = dlopen("libmvec.so.1", RTLD_LAZY); \
if (native) { \
lib->w.lib = native; \
native_v2d_sin = (v2df_func_t)dlsym(native, NATIVE_SIN_NAME); \
native_v2d_cos = (v2df_func_t)dlsym(native, NATIVE_COS_NAME); \
} else { \
lib->w.lib = dlopen(NULL, RTLD_LAZY | RTLD_GLOBAL); \
} \
} else

#include "wrappedlib_init.h"
235 changes: 235 additions & 0 deletions src/wrapped/wrappedlibmvec_private.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
#if !(defined(GO) && defined(GOM) && defined(GO2) && defined(DATA))
#error meh!
#endif

// SSE2 2x double (b = SSE, N = unmasked, 2 = lanes, v = vector argument)
// Only cos and sin are enabled in this section. They are declared with GOM
// and a vFEv signature and are implemented by my__ZGVbN2v_cos /
// my__ZGVbN2v_sin in wrappedlibmvec.c, which take an x64emu_t* and read and
// write the emulated xmm0 themselves (the `E` presumably denotes that emu
// argument — confirm against the wrapper signature conventions).
// NOTE(review): the vFv on the commented-out entries looks like a placeholder.
// These routines take vector arguments, so enabling one probably needs a
// GOM wrapper in the same style as cos/sin — confirm before uncommenting.
//GO(_ZGVbN2v_acos, vFv)
//GO(_ZGVbN2v_acosh, vFv)
//GO(_ZGVbN2v_asin, vFv)
//GO(_ZGVbN2v_asinh, vFv)
//GO(_ZGVbN2v_atan, vFv)
//GO(_ZGVbN2v_atanh, vFv)
//GO(_ZGVbN2v_cbrt, vFv)
GOM(_ZGVbN2v_cos, vFEv)
//GO(_ZGVbN2v_cosh, vFv)
//GO(_ZGVbN2v_erf, vFv)
//GO(_ZGVbN2v_erfc, vFv)
//GO(_ZGVbN2v_exp, vFv)
//GO(_ZGVbN2v_exp10, vFv)
//GO(_ZGVbN2v_exp2, vFv)
//GO(_ZGVbN2v_expm1, vFv)
//GO(_ZGVbN2v_log, vFv)
//GO(_ZGVbN2v_log10, vFv)
//GO(_ZGVbN2v_log1p, vFv)
//GO(_ZGVbN2v_log2, vFv)
GOM(_ZGVbN2v_sin, vFEv)
//GO(_ZGVbN2v_sinh, vFv)
//GO(_ZGVbN2v_tan, vFv)
//GO(_ZGVbN2v_tanh, vFv)
//GO(_ZGVbN2vv_atan2, vFv)
//GO(_ZGVbN2vv_hypot, vFv)
//GO(_ZGVbN2vv_pow, vFv)
//GO(_ZGVbN2vvv_sincos, vFv)

// SSE2 4x float (b = SSE, N = unmasked, 4 = lanes, v = vector argument)
//GO(_ZGVbN4v_acosf, vFv)
//GO(_ZGVbN4v_acoshf, vFv)
//GO(_ZGVbN4v_asinf, vFv)
//GO(_ZGVbN4v_asinhf, vFv)
//GO(_ZGVbN4v_atanf, vFv)
//GO(_ZGVbN4v_atanhf, vFv)
//GO(_ZGVbN4v_cbrtf, vFv)
//GO(_ZGVbN4v_cosf, vFv)
//GO(_ZGVbN4v_coshf, vFv)
//GO(_ZGVbN4v_erfcf, vFv)
//GO(_ZGVbN4v_erff, vFv)
//GO(_ZGVbN4v_exp10f, vFv)
//GO(_ZGVbN4v_exp2f, vFv)
//GO(_ZGVbN4v_expf, vFv)
//GO(_ZGVbN4v_expm1f, vFv)
//GO(_ZGVbN4v_log10f, vFv)
//GO(_ZGVbN4v_log1pf, vFv)
//GO(_ZGVbN4v_log2f, vFv)
//GO(_ZGVbN4v_logf, vFv)
//GO(_ZGVbN4v_sinf, vFv)
//GO(_ZGVbN4v_sinhf, vFv)
//GO(_ZGVbN4v_tanf, vFv)
//GO(_ZGVbN4v_tanhf, vFv)
//GO(_ZGVbN4vv_atan2f, vFv)
//GO(_ZGVbN4vv_hypotf, vFv)
//GO(_ZGVbN4vv_powf, vFv)
//GO(_ZGVbN4vvv_sincosf, vFv)

// AVX 4x double (c = AVX, N = unmasked, 4 = lanes, v = vector argument)
//GO(_ZGVcN4v_acos, vFv)
//GO(_ZGVcN4v_acosh, vFv)
//GO(_ZGVcN4v_asin, vFv)
//GO(_ZGVcN4v_asinh, vFv)
//GO(_ZGVcN4v_atan, vFv)
//GO(_ZGVcN4v_atanh, vFv)
//GO(_ZGVcN4v_cbrt, vFv)
//GO(_ZGVcN4v_cos, vFv)
//GO(_ZGVcN4v_cosh, vFv)
//GO(_ZGVcN4v_erf, vFv)
//GO(_ZGVcN4v_erfc, vFv)
//GO(_ZGVcN4v_exp, vFv)
//GO(_ZGVcN4v_exp10, vFv)
//GO(_ZGVcN4v_exp2, vFv)
//GO(_ZGVcN4v_expm1, vFv)
//GO(_ZGVcN4v_log, vFv)
//GO(_ZGVcN4v_log10, vFv)
//GO(_ZGVcN4v_log1p, vFv)
//GO(_ZGVcN4v_log2, vFv)
//GO(_ZGVcN4v_sin, vFv)
//GO(_ZGVcN4v_sinh, vFv)
//GO(_ZGVcN4v_tan, vFv)
//GO(_ZGVcN4v_tanh, vFv)
//GO(_ZGVcN4vv_atan2, vFv)
//GO(_ZGVcN4vv_hypot, vFv)
//GO(_ZGVcN4vv_pow, vFv)
//GO(_ZGVcN4vvv_sincos, vFv)

// AVX 8x float (c = AVX, N = unmasked, 8 = lanes, v = vector argument)
//GO(_ZGVcN8v_acosf, vFv)
//GO(_ZGVcN8v_acoshf, vFv)
//GO(_ZGVcN8v_asinf, vFv)
//GO(_ZGVcN8v_asinhf, vFv)
//GO(_ZGVcN8v_atanf, vFv)
//GO(_ZGVcN8v_atanhf, vFv)
//GO(_ZGVcN8v_cbrtf, vFv)
//GO(_ZGVcN8v_cosf, vFv)
//GO(_ZGVcN8v_coshf, vFv)
//GO(_ZGVcN8v_erfcf, vFv)
//GO(_ZGVcN8v_erff, vFv)
//GO(_ZGVcN8v_exp10f, vFv)
//GO(_ZGVcN8v_exp2f, vFv)
//GO(_ZGVcN8v_expf, vFv)
//GO(_ZGVcN8v_expm1f, vFv)
//GO(_ZGVcN8v_log10f, vFv)
//GO(_ZGVcN8v_log1pf, vFv)
//GO(_ZGVcN8v_log2f, vFv)
//GO(_ZGVcN8v_logf, vFv)
//GO(_ZGVcN8v_sinf, vFv)
//GO(_ZGVcN8v_sinhf, vFv)
//GO(_ZGVcN8v_tanf, vFv)
//GO(_ZGVcN8v_tanhf, vFv)
//GO(_ZGVcN8vv_atan2f, vFv)
//GO(_ZGVcN8vv_hypotf, vFv)
//GO(_ZGVcN8vv_powf, vFv)
//GO(_ZGVcN8vvv_sincosf, vFv)

// AVX2 4x double (d = AVX2, N = unmasked, 4 = lanes, v = vector argument)
//GO(_ZGVdN4v_acos, vFv)
//GO(_ZGVdN4v_acosh, vFv)
//GO(_ZGVdN4v_asin, vFv)
//GO(_ZGVdN4v_asinh, vFv)
//GO(_ZGVdN4v_atan, vFv)
//GO(_ZGVdN4v_atanh, vFv)
//GO(_ZGVdN4v_cbrt, vFv)
//GO(_ZGVdN4v_cos, vFv)
//GO(_ZGVdN4v_cosh, vFv)
//GO(_ZGVdN4v_erf, vFv)
//GO(_ZGVdN4v_erfc, vFv)
//GO(_ZGVdN4v_exp, vFv)
//GO(_ZGVdN4v_exp10, vFv)
//GO(_ZGVdN4v_exp2, vFv)
//GO(_ZGVdN4v_expm1, vFv)
//GO(_ZGVdN4v_log, vFv)
//GO(_ZGVdN4v_log10, vFv)
//GO(_ZGVdN4v_log1p, vFv)
//GO(_ZGVdN4v_log2, vFv)
//GO(_ZGVdN4v_sin, vFv)
//GO(_ZGVdN4v_sinh, vFv)
//GO(_ZGVdN4v_tan, vFv)
//GO(_ZGVdN4v_tanh, vFv)
//GO(_ZGVdN4vv_atan2, vFv)
//GO(_ZGVdN4vv_hypot, vFv)
//GO(_ZGVdN4vv_pow, vFv)
//GO(_ZGVdN4vvv_sincos, vFv)

// AVX2 8x float (d = AVX2, N = unmasked, 8 = lanes, v = vector argument)
//GO(_ZGVdN8v_acosf, vFv)
//GO(_ZGVdN8v_acoshf, vFv)
//GO(_ZGVdN8v_asinf, vFv)
//GO(_ZGVdN8v_asinhf, vFv)
//GO(_ZGVdN8v_atanf, vFv)
//GO(_ZGVdN8v_atanhf, vFv)
//GO(_ZGVdN8v_cbrtf, vFv)
//GO(_ZGVdN8v_cosf, vFv)
//GO(_ZGVdN8v_coshf, vFv)
//GO(_ZGVdN8v_erfcf, vFv)
//GO(_ZGVdN8v_erff, vFv)
//GO(_ZGVdN8v_exp10f, vFv)
//GO(_ZGVdN8v_exp2f, vFv)
//GO(_ZGVdN8v_expf, vFv)
//GO(_ZGVdN8v_expm1f, vFv)
//GO(_ZGVdN8v_log10f, vFv)
//GO(_ZGVdN8v_log1pf, vFv)
//GO(_ZGVdN8v_log2f, vFv)
//GO(_ZGVdN8v_logf, vFv)
//GO(_ZGVdN8v_sinf, vFv)
//GO(_ZGVdN8v_sinhf, vFv)
//GO(_ZGVdN8v_tanf, vFv)
//GO(_ZGVdN8v_tanhf, vFv)
//GO(_ZGVdN8vv_atan2f, vFv)
//GO(_ZGVdN8vv_hypotf, vFv)
//GO(_ZGVdN8vv_powf, vFv)
//GO(_ZGVdN8vvv_sincosf, vFv)

// AVX-512 8x double (e = AVX-512, N = unmasked, 8 = lanes, v = vector argument)
//GO(_ZGVeN8v_acos, vFv)
//GO(_ZGVeN8v_acosh, vFv)
//GO(_ZGVeN8v_asin, vFv)
//GO(_ZGVeN8v_asinh, vFv)
//GO(_ZGVeN8v_atan, vFv)
//GO(_ZGVeN8v_atanh, vFv)
//GO(_ZGVeN8v_cbrt, vFv)
//GO(_ZGVeN8v_cos, vFv)
//GO(_ZGVeN8v_cosh, vFv)
//GO(_ZGVeN8v_erf, vFv)
//GO(_ZGVeN8v_erfc, vFv)
//GO(_ZGVeN8v_exp, vFv)
//GO(_ZGVeN8v_exp10, vFv)
//GO(_ZGVeN8v_exp2, vFv)
//GO(_ZGVeN8v_expm1, vFv)
//GO(_ZGVeN8v_log, vFv)
//GO(_ZGVeN8v_log10, vFv)
//GO(_ZGVeN8v_log1p, vFv)
//GO(_ZGVeN8v_log2, vFv)
//GO(_ZGVeN8v_sin, vFv)
//GO(_ZGVeN8v_sinh, vFv)
//GO(_ZGVeN8v_tan, vFv)
//GO(_ZGVeN8v_tanh, vFv)
//GO(_ZGVeN8vv_atan2, vFv)
//GO(_ZGVeN8vv_hypot, vFv)
//GO(_ZGVeN8vv_pow, vFv)
//GO(_ZGVeN8vvv_sincos, vFv)

// AVX-512 16x float (e = AVX-512, N = unmasked, 16 = lanes, v = vector argument)
//GO(_ZGVeN16v_acosf, vFv)
//GO(_ZGVeN16v_acoshf, vFv)
//GO(_ZGVeN16v_asinf, vFv)
//GO(_ZGVeN16v_asinhf, vFv)
//GO(_ZGVeN16v_atanf, vFv)
//GO(_ZGVeN16v_atanhf, vFv)
//GO(_ZGVeN16v_cbrtf, vFv)
//GO(_ZGVeN16v_cosf, vFv)
//GO(_ZGVeN16v_coshf, vFv)
//GO(_ZGVeN16v_erfcf, vFv)
//GO(_ZGVeN16v_erff, vFv)
//GO(_ZGVeN16v_exp10f, vFv)
//GO(_ZGVeN16v_exp2f, vFv)
//GO(_ZGVeN16v_expf, vFv)
//GO(_ZGVeN16v_expm1f, vFv)
//GO(_ZGVeN16v_log10f, vFv)
//GO(_ZGVeN16v_log1pf, vFv)
//GO(_ZGVeN16v_log2f, vFv)
//GO(_ZGVeN16v_logf, vFv)
//GO(_ZGVeN16v_sinf, vFv)
//GO(_ZGVeN16v_sinhf, vFv)
//GO(_ZGVeN16v_tanf, vFv)
//GO(_ZGVeN16v_tanhf, vFv)
//GO(_ZGVeN16vv_atan2f, vFv)
//GO(_ZGVeN16vv_hypotf, vFv)
//GO(_ZGVeN16vv_powf, vFv)
//GO(_ZGVeN16vvv_sincosf, vFv)
Loading