From 50ea0027a4f50ed0f2116f6120751116a5f8f135 Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 22 Mar 2023 03:46:48 +0200
Subject: [PATCH 1/4] Enable Fused-Multiply-Add (FMA) instructions on MSVC

__FMA__ macro does not exist in MSVC
---
 ggml.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index ec00e631759..7edcf4c2f74 100644
--- a/ggml.c
+++ b/ggml.c
@@ -407,9 +407,16 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 #define QK 32
 
+#if __AVX2__ || __AVX512F__
+
+// __FMA__ is not defined in MSVC, however it is implied with AVX2/AVX512
+#if defined(_MSC_VER) && !defined(__FMA__)
+#define __FMA__
+#endif
+
 // AVX routines provided by GH user Const-me
 // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
-#if __AVX2__ || __AVX512F__
+
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytesFromNibbles( const uint8_t* rsi )

From c43d45696f69b8b9ce582c6fe301e48e4bd11289 Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 22 Mar 2023 04:05:49 +0200
Subject: [PATCH 2/4] Enable F16C/CVT16 vector extensions on MSVC

__F16C__ macro does not exist in MSVC, but is implied with AVX2/AVX512
---
 ggml.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/ggml.c b/ggml.c
index 7edcf4c2f74..639258c52d2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -79,6 +79,16 @@ static int sched_yield (void) {
 typedef void* thread_ret_t;
 #endif
 
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#endif
+
 #ifdef __HAIKU__
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
@@ -407,16 +417,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 #define QK 32
 
-#if __AVX2__ || __AVX512F__
-
-// __FMA__ is not defined in MSVC, however it is implied with AVX2/AVX512
-#if defined(_MSC_VER) && !defined(__FMA__)
-#define __FMA__
-#endif
-
 // AVX routines provided by GH user Const-me
 // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
-
+#if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytesFromNibbles( const uint8_t* rsi )

From 67ad7c445c401af8a9022011194869a485b61894 Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 22 Mar 2023 04:36:29 +0200
Subject: [PATCH 3/4] MSVC cvt intrinsics

---
 ggml.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ggml.c b/ggml.c
index 639258c52d2..95db777983f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -182,8 +182,13 @@ typedef double ggml_float;
 
 #ifdef __F16C__
 
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
 
 #elif defined(__POWER9_VECTOR__)
 

From b8a80f9994e6877cd075df8324740e951c268e65 Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 22 Mar 2023 05:47:05 +0200
Subject: [PATCH 4/4] Add __SSE3__ macro for MSVC too because why not

even though it's not currently used for anything when AVX is defined
---
 ggml.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml.c b/ggml.c
index 95db777983f..9a9c4bf338e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -87,6 +87,9 @@ typedef void* thread_ret_t;
 #ifndef __F16C__
 #define __F16C__
 #endif
+#ifndef __SSE3__
+#define __SSE3__
+#endif
 #endif
 
 #ifdef __HAIKU__