From 39958dbee7f992bcf37ced23b3adaeeb162a7d54 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Apr 2025 05:13:40 -0700 Subject: [PATCH] Make `disable_fpu_denormals` return the previous `fpu_state`. PiperOrigin-RevId: 745541700 --- src/gcd.c | 4 +- src/portable-api.c | 108 ++++++++++++++--------------------------- src/pthreads.c | 15 ++---- src/threadpool-utils.h | 27 ++++++----- src/windows.c | 6 +-- 5 files changed, 59 insertions(+), 101 deletions(-) diff --git a/src/gcd.c b/src/gcd.c index fddf499..9a2668a 100644 --- a/src/gcd.c +++ b/src/gcd.c @@ -26,6 +26,7 @@ #include /* Internal library headers */ +#include #include "threadpool-atomics.h" #include "threadpool-object.h" #include "threadpool-utils.h" @@ -41,8 +42,7 @@ static void thread_main(void* arg, size_t thread_index) { struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } thread_function(threadpool, thread); diff --git a/src/portable-api.c b/src/portable-api.c index 10a8a55..3517ab2 100644 --- a/src/portable-api.c +++ b/src/portable-api.c @@ -2769,8 +2769,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_1d(struct pthreadpool* threadpool, /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range; i++) { function(context, i); @@ -2800,8 +2799,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_thread( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range; i++) { function(context, 0, i); @@ -2844,8 +2842,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range; i++) { function(context, uarch_index, i); @@ -2881,8 +2878,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range; i += tile) { function(context, i, min(range - i, tile)); @@ -2919,8 +2915,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d_dynamic( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } function(context, 0, range); if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { @@ -2950,8 +2945,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d(pthreadpool_t threadpool, /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -2988,8 +2982,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_with_thread( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3029,8 +3022,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3083,8 +3075,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3128,8 +3119,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_dynamic( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t index_i = 0; index_i < range_i; index_i++) { function(context, index_i, /*index_j=*/0, range_j); @@ -3173,8 +3163,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3220,8 +3209,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i += tile_i) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3266,8 +3254,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_dynamic( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_j <= tile_j) { function(context, /*index_i=*/0, /*index_j=*/0, range_i, range_j); @@ -3315,8 +3302,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_j <= tile_j) { function(context, uarch_index, /*index_i=*/0, /*index_j=*/0, range_i, @@ -3370,8 +3356,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i += tile_i) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3422,8 +3407,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d(pthreadpool_t threadpool, /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3464,8 +3448,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3512,8 +3495,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_thread( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3571,8 +3553,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3632,8 +3613,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3682,8 +3662,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3731,8 +3710,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_dynamic( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_k <= tile_k) { for (size_t index_i = 0; index_i < range_i; index_i++) { @@ -3787,8 +3765,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_k <= tile_k) { for (size_t index_i = 0; index_i < range_i; index_i++) { @@ -3847,8 +3824,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { @@ -3903,8 +3879,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d(pthreadpool_t threadpool, /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -3950,8 +3925,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4002,8 +3976,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4068,8 +4041,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4125,8 +4097,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_l <= tile_l) { for (size_t index_i = 0; index_i < range_i; index_i++) { @@ -4187,8 +4158,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } if (range_l <= tile_l) { for (size_t index_i = 0; index_i < range_i; index_i++) { @@ -4243,8 +4213,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_5d(pthreadpool_t threadpool, /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4293,8 +4262,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4350,8 +4318,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_2d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4408,8 +4375,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_6d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4463,8 +4429,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_1d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { @@ -4523,8 +4488,7 @@ PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_2d( /* No thread pool used: execute task sequentially on the calling thread */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { diff --git a/src/pthreads.c b/src/pthreads.c index e9f492c..888ded6 100644 --- a/src/pthreads.c +++ b/src/pthreads.c @@ -212,6 +212,10 @@ static void* thread_main(void* arg) { struct fpu_state saved_fpu_state = {0}; uint32_t flags = 0; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + disable_fpu_denormals(); + } + /* Check in */ checkin_worker_thread(threadpool); @@ -228,15 +232,7 @@ static void* thread_main(void* arg) { const thread_function_t thread_function = (thread_function_t)pthreadpool_load_relaxed_void_p( &threadpool->thread_function); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - thread_function(threadpool, thread); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } break; } case threadpool_command_shutdown: @@ -417,8 +413,7 @@ PTHREADPOOL_INTERNAL void pthreadpool_parallelize( /* Save and modify FPU denormals control, if needed */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } /* Do computations as worker #0 */ diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h index 2e33d90..7ccb441 100644 --- a/src/threadpool-utils.h +++ b/src/threadpool-utils.h @@ -76,24 +76,23 @@ static inline void set_fpu_state(const struct fpu_state state) { #endif } -static inline void disable_fpu_denormals() { +static inline struct fpu_state disable_fpu_denormals() { + struct fpu_state state = {0}; #if defined(_MSC_VER) && defined(_M_ARM) - int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0); - fpscr |= 0x1000000; - _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0); + state.fpscr = (uint32_t)_MoveFromCoprocessor(10, 7, 1, 0, 0); + _MoveToCoprocessor(state.fpscr | 0x1000000, 10, 7, 1, 0, 0); #elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) - __int64 fpcr = _ReadStatusReg(0x5A20); - fpcr |= 0x1080000; - _WriteStatusReg(0x5A20, fpcr); + state.fpcr = (uint64_t)_ReadStatusReg(0x5A20); + _WriteStatusReg(0x5A20, state.fpcr | 0x1080000); #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ (__ARM_FP != 0) - uint32_t fpscr; + __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r"(state.fpscr)); #if defined(__thumb__) && !defined(__thumb2__) __asm__ __volatile__( "VMRS %[fpscr], fpscr\n" "ORRS %[fpscr], %[bitmask]\n" "VMSR fpscr, %[fpscr]\n" - : [fpscr] "=l"(fpscr) + : [fpscr] "=l"(state.fpscr) : [bitmask] "l"(0x1000000) : "cc"); #else @@ -101,20 +100,22 @@ static inline void disable_fpu_denormals() { "VMRS %[fpscr], fpscr\n" "ORR %[fpscr], #0x1000000\n" "VMSR fpscr, %[fpscr]\n" - : [fpscr] "=r"(fpscr)); + : [fpscr] "=r"(state.fpscr)); #endif #elif defined(__GNUC__) && defined(__aarch64__) - uint64_t fpcr; + __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r"(state.fpcr)); __asm__ __volatile__( "MRS %[fpcr], fpcr\n" "ORR %w[fpcr], %w[fpcr], 0x1000000\n" "ORR %w[fpcr], %w[fpcr], 0x80000\n" "MSR fpcr, %[fpcr]\n" - : [fpcr] "=r"(fpcr)); + : [fpcr] "=r"(state.fpcr)); #elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - _mm_setcsr(_mm_getcsr() | 0x8040); + state.mxcsr = (uint32_t)_mm_getcsr(); + _mm_setcsr(state.mxcsr | 0x8040); #endif + return state; } static inline size_t modulo_decrement(size_t i, size_t n) { diff --git a/src/windows.c b/src/windows.c index d3e9575..e9b6bfc 100644 --- a/src/windows.c +++ b/src/windows.c @@ -122,8 +122,7 @@ static DWORD WINAPI thread_main(LPVOID arg) { (thread_function_t)pthreadpool_load_relaxed_void_p( &threadpool->thread_function); if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } thread_function(threadpool, thread); @@ -296,8 +295,7 @@ PTHREADPOOL_INTERNAL void pthreadpool_parallelize( /* Save and modify FPU denormals control, if needed */ struct fpu_state saved_fpu_state = {0}; if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); + saved_fpu_state = disable_fpu_denormals(); } /* Do computations as worker #0 */