From c626e267f2146134c5e2e4efe931ec6bdccce92c Mon Sep 17 00:00:00 2001 From: Luca Giacchino Date: Tue, 9 Apr 2024 09:08:26 -0700 Subject: [PATCH 1/5] Implement AVX512 distance calculations for halfvec --- src/halfutils.c | 400 +++++++++++++++++++++++ src/halfutils.h | 3 + test/expected/halfvec_functions_fp16.out | 141 ++++++++ test/sql/halfvec_functions_fp16.sql | 31 ++ 4 files changed, 575 insertions(+) create mode 100644 test/expected/halfvec_functions_fp16.out create mode 100644 test/sql/halfvec_functions_fp16.sql diff --git a/src/halfutils.c b/src/halfutils.c index d16909409..9ef916803 100644 --- a/src/halfutils.c +++ b/src/halfutils.c @@ -2,6 +2,7 @@ #include "halfutils.h" #include "halfvec.h" +#include "utils/guc.h" #ifdef HALFVEC_DISPATCH #include @@ -12,13 +13,23 @@ #include #endif +#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \ + (defined(__clang__) && (__clang_major__ >= 16)) || \ + (defined __AVX512FP16__) +#define HAVE_AVX512FP16 +#endif + #ifdef _MSC_VER #define TARGET_F16C +#define TARGET_AVX512FP16 #else #define TARGET_F16C __attribute__((target("avx,f16c,fma"))) +#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512vl,avx512bw"))) #endif #endif +bool halfvec_use_fp16_compute; + float (*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx); float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx); @@ -74,6 +85,85 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx) return distance; } + +#ifdef HAVE_AVX512FP16 +TARGET_AVX512FP16 static float +HalfvecL2SquaredDistanceAvx512Fp16(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 32) * 32; + unsigned long mask; + __m512h axi; + __m512h bxi; + __m512h diff; + __m512h dist = _mm512_setzero_ph(); + + for (i = 0; i < count; i += 32) + { + axi = _mm512_loadu_ph(ax+i); + bxi = _mm512_loadu_ph(bx+i); + diff = _mm512_sub_ph(axi, bxi); + dist = _mm512_fmadd_ph(diff, diff, dist); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + diff = _mm512_sub_ph(axi, bxi); + dist = _mm512_fmadd_ph(diff, diff, dist); + + distance = (float)_mm512_reduce_add_ph(dist); + + return distance; +} + +TARGET_AVX512FP16 static float +HalfvecL2SquaredDistanceAvx512Fp32(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 16) * 16; + unsigned long mask; + __m256h axi; + __m256h bxi; + __m512 axs; + __m512 bxs; + __m512 diff; + __m512 dist = _mm512_setzero_ps(); + + for (i = 0; i < count; i += 16) + { + axi = _mm256_loadu_ph(ax+i); + bxi = _mm256_loadu_ph(bx+i); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + diff = _mm512_sub_ps(axs, bxs); + dist = _mm512_fmadd_ps(diff, diff, dist); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + diff = _mm512_sub_ps(axs, bxs); + dist = _mm512_fmadd_ps(diff, diff, dist); + + distance = (float)_mm512_reduce_add_ps(dist); + + return distance; +} + +static float +HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx) +{ + if (halfvec_use_fp16_compute) + return HalfvecL2SquaredDistanceAvx512Fp16(dim, ax, bx); + else + return HalfvecL2SquaredDistanceAvx512Fp32(dim, ax, bx); +} +#endif #endif static float @@ -117,6 +207,79 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx) return distance; } + +#ifdef HAVE_AVX512FP16 +TARGET_AVX512FP16 static float +HalfvecInnerProductAvx512Fp16(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 32) * 32; + unsigned long mask; + __m512h axi; + __m512h bxi; + __m512h dist = _mm512_setzero_ph(); + + for (i = 0; i < count; i += 32) + { + axi = _mm512_loadu_ph(ax+i); + bxi = _mm512_loadu_ph(bx+i); + dist = _mm512_fmadd_ph(axi, bxi, dist); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + dist = _mm512_fmadd_ph(axi, bxi, dist); + + distance = (float)_mm512_reduce_add_ph(dist); + + return distance; +} + +TARGET_AVX512FP16 static float +HalfvecInnerProductAvx512Fp32(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 16) * 16; + unsigned long mask; + __m256h axi; + __m256h bxi; + __m512 axs; + __m512 bxs; + __m512 dist = _mm512_setzero_ps(); + + for (i = 0; i < count; i += 16) + { + axi = _mm256_loadu_ph(ax+i); + bxi = _mm256_loadu_ph(bx+i); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + dist = _mm512_fmadd_ps(axs, bxs, dist); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + dist = _mm512_fmadd_ps(axs, bxs, dist); + + distance = (float)_mm512_reduce_add_ps(dist); + + return distance; +} + +static float +HalfvecInnerProductAvx512(int dim, half * ax, half * bx) +{ + if (halfvec_use_fp16_compute) + return HalfvecInnerProductAvx512Fp16(dim, ax, bx); + else + return HalfvecInnerProductAvx512Fp32(dim, ax, bx); +} +#endif #endif static double @@ -190,6 +353,101 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx) /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ return (double) similarity / sqrt((double) norma * (double) normb); } + +#ifdef HAVE_AVX512FP16 +TARGET_AVX512FP16 static double +HalfvecCosineSimilarityAvx512Fp16(int dim, half * ax, half * bx) +{ + float similarity; + float norma; + float normb; + int i; + int count = (dim / 32) * 32; + unsigned long mask; + __m512h axi; + __m512h bxi; + __m512h sim = _mm512_setzero_ph(); + __m512h na = _mm512_setzero_ph(); + __m512h nb = _mm512_setzero_ph(); + + for (i = 0; i < count; i += 32) + { + axi = _mm512_loadu_ph(ax+i); + bxi = _mm512_loadu_ph(bx+i); + sim = _mm512_fmadd_ph(axi, bxi, sim); + na = _mm512_fmadd_ph(axi, axi, na); + nb = _mm512_fmadd_ph(bxi, bxi, nb); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + sim = _mm512_fmadd_ph(axi, bxi, sim); + na = _mm512_fmadd_ph(axi, axi, na); + nb = _mm512_fmadd_ph(bxi, bxi, nb); + + similarity = (float)_mm512_reduce_add_ph(sim); + norma = (float)_mm512_reduce_add_ph(na); + normb = (float)_mm512_reduce_add_ph(nb); + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); +} + +TARGET_AVX512FP16 static double +HalfvecCosineSimilarityAvx512Fp32(int dim, half * ax, half * bx) +{ + float similarity; + float norma; + float normb; + int i; + int count = (dim / 16) * 16; + unsigned long mask; + __m256h axi; + __m256h bxi; + __m512 axs; + __m512 bxs; + __m512 sim = _mm512_setzero_ps(); + __m512 na = _mm512_setzero_ps(); + __m512 nb = _mm512_setzero_ps(); + + for (i = 0; i < count; i += 16) + { + axi = _mm256_loadu_ph(ax+i); + bxi = _mm256_loadu_ph(bx+i); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + sim = _mm512_fmadd_ps(axs, bxs, sim); + na = _mm512_fmadd_ps(axs, axs, na); + nb = _mm512_fmadd_ps(bxs, bxs, nb); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + sim = _mm512_fmadd_ps(axs, bxs, sim); + na = _mm512_fmadd_ps(axs, axs, na); + nb = _mm512_fmadd_ps(bxs, bxs, nb); + + similarity = (float)_mm512_reduce_add_ps(sim); + norma = (float)_mm512_reduce_add_ps(na); + normb = (float)_mm512_reduce_add_ps(nb); + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); +} + +static double +HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx) +{ + if (halfvec_use_fp16_compute) + return HalfvecCosineSimilarityAvx512Fp16(dim, ax, bx); + else + return HalfvecCosineSimilarityAvx512Fp32(dim, ax, bx); +} +#endif #endif static float @@ -235,6 +493,79 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx) return distance; } + +#ifdef HAVE_AVX512FP16 +TARGET_AVX512FP16 static float +HalfvecL1DistanceAvx512Fp16(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 32) * 32; + unsigned long mask; + __m512h axi; + __m512h bxi; + __m512h dist = _mm512_setzero_ph(); + + for (i = 0; i < count; i += 32) + { + axi = _mm512_loadu_ph(ax+i); + bxi = _mm512_loadu_ph(bx+i); + dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi))); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi))); + + distance = (float)_mm512_reduce_add_ph(dist); + + return distance; +} + +TARGET_AVX512FP16 static float +HalfvecL1DistanceAvx512Fp32(int dim, half * ax, half * bx) +{ + float distance; + int i; + int count = (dim / 16) * 16; + unsigned long mask; + __m256h axi; + __m256h bxi; + __m512 axs; + __m512 bxs; + __m512 dist = _mm512_setzero_ps(); + + for (i = 0; i < count; i += 16) + { + axi = _mm256_loadu_ph(ax+i); + bxi = _mm256_loadu_ph(bx+i); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs))); + } + + mask = (1 << (dim - i)) - 1; + axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + axs = _mm512_cvtxph_ps(axi); + bxs = _mm512_cvtxph_ps(bxi); + dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs))); + + distance = (float)_mm512_reduce_add_ps(dist); + + return distance; +} + +static float +HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) +{ + if (halfvec_use_fp16_compute) + return HalfvecL1DistanceAvx512Fp16(dim, ax, bx); + else + return HalfvecL1DistanceAvx512Fp32(dim, ax, bx); +} +#endif #endif #ifdef HALFVEC_DISPATCH @@ -271,6 +602,61 @@ SupportsCpuFeature(unsigned int feature) /* Now check features */ return (exx[2] & feature) == feature; } + +#ifdef HAVE_AVX512FP16 +TARGET_XSAVE static bool +SupportsOsXsave() +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#else + __cpuid(exx, 1); +#endif + + return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE; +} + +#define CPU_FEATURE_AVX512F (1 << 16) +#define CPU_FEATURE_AVX512_FP16 (1 << 23) +#define CPU_FEATURE_AVX512_BW (1 << 30) +#define CPU_FEATURE_AVX512VL (1 << 31) + +TARGET_XSAVE static bool +SupportsAvx512Fp16() +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Check OS supports XSAVE */ + if (!SupportsOsXsave()) + return false; + + /* Check XMM, YMM, and ZMM registers are enabled */ + if ((_xgetbv(0) & 0xe6) != 0xe6) + return false; + +#if defined(HAVE__GET_CPUID) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 7, 0); +#endif + + /* Required by AVX512 sub/fma/add instructions */ + if ((exx[1] & CPU_FEATURE_AVX512F) != CPU_FEATURE_AVX512F) + return false; + + /* Required by _mm256_loadu_ph */ + if ((exx[1] & CPU_FEATURE_AVX512VL) != CPU_FEATURE_AVX512VL) + return false; + + /* Required by masked loads in remainder loops */ + if ((exx[1] & CPU_FEATURE_AVX512_BW) != CPU_FEATURE_AVX512_BW) + return false; + + return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16; +} +#endif #endif void @@ -294,5 +680,19 @@ HalfvecInit(void) /* Does not require FMA, but keep logic simple */ HalfvecL1Distance = HalfvecL1DistanceF16c; } + +#ifdef HAVE_AVX512FP16 + if (SupportsAvx512Fp16()) + { + HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceAvx512; + HalfvecInnerProduct = HalfvecInnerProductAvx512; + HalfvecCosineSimilarity = HalfvecCosineSimilarityAvx512; + HalfvecL1Distance = HalfvecL1DistanceAvx512; + } #endif +#endif + + DefineCustomBoolVariable("halfvec.use_fp16_compute", "Use FP16 computation for distance calculations", + "If true, distance calculations are executed in FP16. If false, distance calculations are executed in FP32", + &halfvec_use_fp16_compute, false, PGC_USERSET, 0, NULL, NULL, NULL); } diff --git a/src/halfutils.h b/src/halfutils.h index c684f72d7..99131d583 100644 --- a/src/halfutils.h +++ b/src/halfutils.h @@ -17,6 +17,9 @@ extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx); void HalfvecInit(void); +/* Variables */ +extern bool halfvec_use_fp16_compute; + /* * Check if half is NaN */ diff --git a/test/expected/halfvec_functions_fp16.out b/test/expected/halfvec_functions_fp16.out new file mode 100644 index 000000000..a82a604b0 --- /dev/null +++ b/test/expected/halfvec_functions_fp16.out @@ -0,0 +1,141 @@ +SET halfvec.use_fp16_compute = true; +SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); + l2_distance +------------- + 5 +(1 row) + +SELECT '[0,0]'::halfvec <-> '[3,4]'; + ?column? +---------- + 5 +(1 row) + +SELECT inner_product('[1,2]'::halfvec, '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + inner_product +--------------- + 45 +(1 row) + +SELECT '[1,2]'::halfvec <#> '[3,4]'; + ?column? +---------- + -11 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT '[1,2]'::halfvec <=> '[2,4]'; + ?column? +---------- + 0 +(1 row) + +SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + l1_distance +------------- + 0 +(1 row) + +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); + l1_distance +------------- + 9 +(1 row) + +SELECT '[0,0]'::halfvec <+> '[3,4]'; + ?column? +---------- + 7 +(1 row) + diff --git a/test/sql/halfvec_functions_fp16.sql b/test/sql/halfvec_functions_fp16.sql new file mode 100644 index 000000000..e930a840e --- /dev/null +++ b/test/sql/halfvec_functions_fp16.sql @@ -0,0 +1,31 @@ +SET halfvec.use_fp16_compute = true; +SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); +SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); +SELECT l2_distance('[1,2]'::halfvec, '[3]'); +SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); +SELECT '[0,0]'::halfvec <-> '[3,4]'; + +SELECT inner_product('[1,2]'::halfvec, '[3,4]'); +SELECT inner_product('[1,2]'::halfvec, '[3]'); +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT '[1,2]'::halfvec <#> '[3,4]'; + +SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); +SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); +SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); +SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); +SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::halfvec, '[3]'); +SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); +SELECT '[1,2]'::halfvec <=> '[2,4]'; + +SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); +SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); +SELECT l1_distance('[1,2]'::halfvec, '[3]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); +SELECT '[0,0]'::halfvec <+> '[3,4]'; + From 38518c5cd6ef01e3e2433d03285b602e08279739 Mon Sep 17 00:00:00 2001 From: Luca Giacchino Date: Wed, 25 Sep 2024 14:33:58 -0700 Subject: [PATCH 2/5] Automate fp16/fp32 halfvec distance computation --- src/halfutils.c | 539 +++++++++++------------ src/halfutils.h | 3 - test/expected/halfvec.out | 66 +++ test/expected/halfvec_functions_fp16.out | 141 ------ test/sql/halfvec.sql | 11 + test/sql/halfvec_functions_fp16.sql | 31 -- 6 files changed, 343 insertions(+), 448 deletions(-) delete mode 100644 test/expected/halfvec_functions_fp16.out delete mode 100644 test/sql/halfvec_functions_fp16.sql diff --git a/src/halfutils.c b/src/halfutils.c index 9ef916803..4b4cf9d4b 100644 --- a/src/halfutils.c +++ b/src/halfutils.c @@ -2,7 +2,6 @@ #include "halfutils.h" #include "halfvec.h" -#include "utils/guc.h" #ifdef HALFVEC_DISPATCH #include @@ -24,12 +23,10 @@ #define TARGET_AVX512FP16 #else #define TARGET_F16C __attribute__((target("avx,f16c,fma"))) -#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512vl,avx512bw"))) +#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw"))) #endif #endif -bool halfvec_use_fp16_compute; - float (*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx); float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx); @@ -87,82 +84,90 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx) } #ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static float -HalfvecL2SquaredDistanceAvx512Fp16(int dim, half * ax, half * bx) -{ - float distance; - int i; - int count = (dim / 32) * 32; - unsigned long mask; - __m512h axi; - __m512h bxi; - __m512h diff; - __m512h dist = _mm512_setzero_ph(); - - for (i = 0; i < count; i += 32) - { - axi = _mm512_loadu_ph(ax+i); - bxi = _mm512_loadu_ph(bx+i); - diff = _mm512_sub_ph(axi, bxi); - dist = _mm512_fmadd_ph(diff, diff, dist); - } - - mask = (1 << (dim - i)) - 1; - axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - diff = _mm512_sub_ph(axi, bxi); - dist = _mm512_fmadd_ph(diff, diff, dist); - - distance = (float)_mm512_reduce_add_ph(dist); +TARGET_AVX512FP16 static inline bool +HasInfinity(__m512h val) { + /* Test for positive and negative infinity */ + __mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10); + return mask != 0; +} - return distance; +TARGET_AVX512FP16 static inline __m512 +ConvertToFp32Sum(__m512h val) { + __m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0)); + __m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1)); + return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper)); } TARGET_AVX512FP16 static float -HalfvecL2SquaredDistanceAvx512Fp32(int dim, half * ax, half * bx) +HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx) { float distance; int i; - int count = (dim / 16) * 16; unsigned long mask; - __m256h axi; - __m256h bxi; - __m512 axs; - __m512 bxs; - __m512 diff; - __m512 dist = _mm512_setzero_ps(); - - for (i = 0; i < count; i += 16) + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h diff_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 diff_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) { - axi = _mm256_loadu_ph(ax+i); - bxi = _mm256_loadu_ph(bx+i); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - diff = _mm512_sub_ps(axs, bxs); - dist = _mm512_fmadd_ps(diff, diff, dist); + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + diff_512h = _mm512_sub_ph(axi_512h, bxi_512h); + dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; } - - mask = (1 << (dim - i)) - 1; - axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - diff = _mm512_sub_ps(axs, bxs); - dist = _mm512_fmadd_ps(diff, diff, dist); + dist_512 = ConvertToFp32Sum(dist_512h); - distance = (float)_mm512_reduce_add_ps(dist); + /* FP32 computation */ + for (; i < dim; i += 16) + { + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + diff_512 = _mm512_sub_ps(axi_512, bxi_512); + dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512); + } + distance = _mm512_reduce_add_ps(dist_512); return distance; } - -static float -HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx) -{ - if (halfvec_use_fp16_compute) - return HalfvecL2SquaredDistanceAvx512Fp16(dim, ax, bx); - else - return HalfvecL2SquaredDistanceAvx512Fp32(dim, ax, bx); -} #endif #endif @@ -210,75 +215,71 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx) #ifdef HAVE_AVX512FP16 TARGET_AVX512FP16 static float -HalfvecInnerProductAvx512Fp16(int dim, half * ax, half * bx) +HalfvecInnerProductAvx512(int dim, half * ax, half * bx) { float distance; int i; - int count = (dim / 32) * 32; - unsigned long mask; - __m512h axi; - __m512h bxi; - __m512h dist = _mm512_setzero_ph(); - - for (i = 0; i < count; i += 32) + unsigned int mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) { - axi = _mm512_loadu_ph(ax+i); - bxi = _mm512_loadu_ph(bx+i); - dist = _mm512_fmadd_ph(axi, bxi, dist); + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; } + dist_512 = ConvertToFp32Sum(dist_512h); - mask = (1 << (dim - i)) - 1; - axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - dist = _mm512_fmadd_ph(axi, bxi, dist); - - distance = (float)_mm512_reduce_add_ph(dist); - - return distance; -} - -TARGET_AVX512FP16 static float -HalfvecInnerProductAvx512Fp32(int dim, half * ax, half * bx) -{ - float distance; - int i; - int count = (dim / 16) * 16; - unsigned long mask; - __m256h axi; - __m256h bxi; - __m512 axs; - __m512 bxs; - __m512 dist = _mm512_setzero_ps(); - - for (i = 0; i < count; i += 16) + /* FP32 computation */ + for (; i < dim; i += 16) { - axi = _mm256_loadu_ph(ax+i); - bxi = _mm256_loadu_ph(bx+i); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - dist = _mm512_fmadd_ps(axs, bxs, dist); + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512); } - mask = (1 << (dim - i)) - 1; - axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - dist = _mm512_fmadd_ps(axs, bxs, dist); - - distance = (float)_mm512_reduce_add_ps(dist); - + distance = _mm512_reduce_add_ps(dist_512); return distance; } - -static float -HalfvecInnerProductAvx512(int dim, half * ax, half * bx) -{ - if (halfvec_use_fp16_compute) - return HalfvecInnerProductAvx512Fp16(dim, ax, bx); - else - return HalfvecInnerProductAvx512Fp32(dim, ax, bx); -} #endif #endif @@ -356,97 +357,93 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx) #ifdef HAVE_AVX512FP16 TARGET_AVX512FP16 static double -HalfvecCosineSimilarityAvx512Fp16(int dim, half * ax, half * bx) +HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx) { float similarity; float norma; float normb; int i; - int count = (dim / 32) * 32; - unsigned long mask; - __m512h axi; - __m512h bxi; - __m512h sim = _mm512_setzero_ph(); - __m512h na = _mm512_setzero_ph(); - __m512h nb = _mm512_setzero_ph(); - - for (i = 0; i < count; i += 32) + unsigned int mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h sim_512h = _mm512_setzero_ph(); + __m512h na_512h = _mm512_setzero_ph(); + __m512h nb_512h = _mm512_setzero_ph(); + __m512h sim_512h_temp; + __m512h na_512h_temp; + __m512h nb_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 sim_512; + __m512 na_512; + __m512 nb_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) { - axi = _mm512_loadu_ph(ax+i); - bxi = _mm512_loadu_ph(bx+i); - sim = _mm512_fmadd_ph(axi, bxi, sim); - na = _mm512_fmadd_ph(axi, axi, na); - nb = _mm512_fmadd_ph(bxi, bxi, nb); + if (dim - i < 32) { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h); + na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h); + nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(sim_512h_temp) || + HasInfinity(na_512h_temp) || + HasInfinity(nb_512h_temp)) + break; + else + { + sim_512h = sim_512h_temp; + na_512h = na_512h_temp; + nb_512h = nb_512h_temp; + } } + sim_512 = ConvertToFp32Sum(sim_512h); + na_512 = ConvertToFp32Sum(na_512h); + nb_512 = ConvertToFp32Sum(nb_512h); - mask = (1 << (dim - i)) - 1; - axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - sim = _mm512_fmadd_ph(axi, bxi, sim); - na = _mm512_fmadd_ph(axi, axi, na); - nb = _mm512_fmadd_ph(bxi, bxi, nb); - - similarity = (float)_mm512_reduce_add_ph(sim); - norma = (float)_mm512_reduce_add_ph(na); - normb = (float)_mm512_reduce_add_ph(nb); - - /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ - return (double) similarity / sqrt((double) norma * (double) normb); -} - -TARGET_AVX512FP16 static double -HalfvecCosineSimilarityAvx512Fp32(int dim, half * ax, half * bx) -{ - float similarity; - float norma; - float normb; - int i; - int count = (dim / 16) * 16; - unsigned long mask; - __m256h axi; - __m256h bxi; - __m512 axs; - __m512 bxs; - __m512 sim = _mm512_setzero_ps(); - __m512 na = _mm512_setzero_ps(); - __m512 nb = _mm512_setzero_ps(); - - for (i = 0; i < count; i += 16) + /* FP32 computation */ + for (; i < dim; i += 16) { - axi = _mm256_loadu_ph(ax+i); - bxi = _mm256_loadu_ph(bx+i); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - sim = _mm512_fmadd_ps(axs, bxs, sim); - na = _mm512_fmadd_ps(axs, axs, na); - nb = _mm512_fmadd_ps(bxs, bxs, nb); + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512); + na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512); + nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512); } - mask = (1 << (dim - i)) - 1; - axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - sim = _mm512_fmadd_ps(axs, bxs, sim); - na = _mm512_fmadd_ps(axs, axs, na); - nb = _mm512_fmadd_ps(bxs, bxs, nb); - - similarity = (float)_mm512_reduce_add_ps(sim); - norma = (float)_mm512_reduce_add_ps(na); - normb = (float)_mm512_reduce_add_ps(nb); + similarity = _mm512_reduce_add_ps(sim_512); + norma = _mm512_reduce_add_ps(na_512); + normb = _mm512_reduce_add_ps(nb_512); /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ return (double) similarity / sqrt((double) norma * (double) normb); } - -static double -HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx) -{ - if (halfvec_use_fp16_compute) - return HalfvecCosineSimilarityAvx512Fp16(dim, ax, bx); - else - return HalfvecCosineSimilarityAvx512Fp32(dim, ax, bx); -} #endif #endif @@ -496,75 +493,72 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx) #ifdef HAVE_AVX512FP16 TARGET_AVX512FP16 static float -HalfvecL1DistanceAvx512Fp16(int dim, half * ax, half * bx) +HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) { float distance; int i; - int count = (dim / 32) * 32; unsigned long mask; - __m512h axi; - __m512h bxi; - __m512h dist = _mm512_setzero_ph(); - for (i = 0; i < count; i += 32) + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) { - axi = _mm512_loadu_ph(ax+i); - bxi = _mm512_loadu_ph(bx+i); - dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi))); + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h))); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; } + dist_512 = ConvertToFp32Sum(dist_512h); - mask = (1 << (dim - i)) - 1; - axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi))); - - distance = (float)_mm512_reduce_add_ph(dist); - - return distance; -} - -TARGET_AVX512FP16 static float -HalfvecL1DistanceAvx512Fp32(int dim, half * ax, half * bx) -{ - float distance; - int i; - int count = (dim / 16) * 16; - unsigned long mask; - __m256h axi; - __m256h bxi; - __m512 axs; - __m512 bxs; - __m512 dist = _mm512_setzero_ps(); - - for (i = 0; i < count; i += 16) + /* FP32 computation */ + for (; i < dim; i += 16) { - axi = _mm256_loadu_ph(ax+i); - bxi = _mm256_loadu_ph(bx+i); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs))); + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512))); } - mask = (1 << (dim - i)) - 1; - axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - axs = _mm512_cvtxph_ps(axi); - bxs = _mm512_cvtxph_ps(bxi); - dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs))); - - distance = (float)_mm512_reduce_add_ps(dist); + distance = _mm512_reduce_add_ps(dist_512); return distance; } - -static float -HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) -{ - if (halfvec_use_fp16_compute) - return HalfvecL1DistanceAvx512Fp16(dim, ax, bx); - else - return HalfvecL1DistanceAvx512Fp32(dim, ax, bx); -} #endif #endif @@ -618,16 +612,28 @@ SupportsOsXsave() return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE; } -#define CPU_FEATURE_AVX512F (1 << 16) -#define CPU_FEATURE_AVX512_FP16 (1 << 23) -#define CPU_FEATURE_AVX512_BW (1 << 30) -#define CPU_FEATURE_AVX512VL (1 << 31) +#define CPU_FEATURE_AVX512F (1 << 16) +#define CPU_FEATURE_AVX512DQ (1 << 17) +#define CPU_FEATURE_AVX512_FP16 (1 << 23) +#define CPU_FEATURE_AVX512BW (1 << 30) +#define CPU_FEATURE_AVX512VL (1 << 31) TARGET_XSAVE static bool SupportsAvx512Fp16() { unsigned int exx[4] = {0, 0, 0, 0}; + /* AVX512 features required: + * AVX512F : sub/fma/add instructions + * AVX512DQ: _mm512_extracti32x8_epi32 + * AVX512VL: _mm256_loadu_ph + * AVX512BW: masked loads + */ + unsigned int features = CPU_FEATURE_AVX512F | + CPU_FEATURE_AVX512DQ | + CPU_FEATURE_AVX512VL | + CPU_FEATURE_AVX512BW; + /* Check OS supports XSAVE */ if (!SupportsOsXsave()) return false; @@ -642,16 +648,7 @@ SupportsAvx512Fp16() __cpuid(exx, 7, 0); #endif - /* Required by AVX512 sub/fma/add instructions */ - if ((exx[1] & CPU_FEATURE_AVX512F) != CPU_FEATURE_AVX512F) - return false; - - /* Required by _mm256_loadu_ph */ - if ((exx[1] & CPU_FEATURE_AVX512VL) != CPU_FEATURE_AVX512VL) - return false; - - /* Required by masked loads in remainder loops */ - if ((exx[1] & CPU_FEATURE_AVX512_BW) != CPU_FEATURE_AVX512_BW) + if ((exx[1] & features) != features) return false; return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16; @@ -691,8 +688,4 @@ HalfvecInit(void) } #endif #endif - - DefineCustomBoolVariable("halfvec.use_fp16_compute", "Use FP16 computation for distance calculations", - "If true, distance calculations are executed in FP16. If false, distance calculations are executed in FP32", - &halfvec_use_fp16_compute, false, PGC_USERSET, 0, NULL, NULL, NULL); } diff --git a/src/halfutils.h b/src/halfutils.h index 99131d583..c684f72d7 100644 --- a/src/halfutils.h +++ b/src/halfutils.h @@ -17,9 +17,6 @@ extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx); void HalfvecInit(void); -/* Variables */ -extern bool halfvec_use_fp16_compute; - /* * Check if half is NaN */ diff --git a/test/expected/halfvec.out b/test/expected/halfvec.out index 3ef913d6c..c3856c512 100644 --- a/test/expected/halfvec.out +++ b/test/expected/halfvec.out @@ -378,6 +378,24 @@ SELECT '[0,0]'::halfvec <-> '[3,4]'; 5 (1 row) +SELECT l2_distance('[501]'::halfvec, '[1]'); + l2_distance +------------- + 500 +(1 row) + +SELECT l2_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + l2_distance +------------- + 0 +(1 row) + +SELECT l2_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + l2_distance +------------- + 6 +(1 row) + SELECT inner_product('[1,2]'::halfvec, '[3,4]'); inner_product --------------- @@ -404,6 +422,24 @@ SELECT '[1,2]'::halfvec <#> '[3,4]'; -11 (1 row) +SELECT inner_product('[50000,50000]'::halfvec, '[1,1]'); + inner_product +--------------- + 99968 +(1 row) + +SELECT inner_product('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + inner_product +--------------- + 36 +(1 row) + +SELECT inner_product('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + inner_product +--------------- + 72 +(1 row) + SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); cosine_distance ----------------- @@ -466,6 +502,24 @@ SELECT '[1,2]'::halfvec <=> '[2,4]'; 0 (1 row) +SELECT cosine_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]'::halfvec, '[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]'); + cosine_distance +----------------- + 1 +(1 row) + SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); l1_distance ------------- @@ -498,6 +552,18 @@ SELECT '[0,0]'::halfvec <+> '[3,4]'; 7 (1 row) +SELECT l1_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + l1_distance +------------- + 0 +(1 row) + +SELECT l1_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); + l1_distance +------------- + 36 +(1 row) + SELECT l2_normalize('[3,4]'::halfvec); l2_normalize ------------------------ diff --git a/test/expected/halfvec_functions_fp16.out b/test/expected/halfvec_functions_fp16.out deleted file mode 100644 index a82a604b0..000000000 --- a/test/expected/halfvec_functions_fp16.out +++ /dev/null @@ -1,141 +0,0 @@ -SET halfvec.use_fp16_compute = true; -SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); - l2_distance -------------- - 5 -(1 row) - -SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); - l2_distance -------------- - 1 -(1 row) - -SELECT l2_distance('[1,2]'::halfvec, '[3]'); -ERROR: different halfvec dimensions 2 and 1 -SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); - l2_distance -------------- - 5 -(1 row) - -SELECT '[0,0]'::halfvec <-> '[3,4]'; - ?column? ----------- - 5 -(1 row) - -SELECT inner_product('[1,2]'::halfvec, '[3,4]'); - inner_product ---------------- - 11 -(1 row) - -SELECT inner_product('[1,2]'::halfvec, '[3]'); -ERROR: different halfvec dimensions 2 and 1 -SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); - inner_product ---------------- - 45 -(1 row) - -SELECT '[1,2]'::halfvec <#> '[3,4]'; - ?column? ----------- - -11 -(1 row) - -SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); - cosine_distance ------------------ - NaN -(1 row) - -SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); - cosine_distance ------------------ - 1 -(1 row) - -SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]'); - cosine_distance ------------------ - 2 -(1 row) - -SELECT cosine_distance('[1,2]'::halfvec, '[3]'); -ERROR: different halfvec dimensions 2 and 1 -SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); - cosine_distance ------------------ - 2 -(1 row) - -SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); - cosine_distance ------------------ - 2 -(1 row) - -SELECT '[1,2]'::halfvec <=> '[2,4]'; - ?column? ----------- - 0 -(1 row) - -SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); - l1_distance -------------- - 7 -(1 row) - -SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); - l1_distance -------------- - 1 -(1 row) - -SELECT l1_distance('[1,2]'::halfvec, '[3]'); -ERROR: different halfvec dimensions 2 and 1 -SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); - l1_distance -------------- - 0 -(1 row) - -SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); - l1_distance -------------- - 9 -(1 row) - -SELECT '[0,0]'::halfvec <+> '[3,4]'; - ?column? ----------- - 7 -(1 row) - diff --git a/test/sql/halfvec.sql b/test/sql/halfvec.sql index 592a6e38a..744d03868 100644 --- a/test/sql/halfvec.sql +++ b/test/sql/halfvec.sql @@ -87,12 +87,18 @@ SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); SELECT l2_distance('[1,2]'::halfvec, '[3]'); SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); SELECT '[0,0]'::halfvec <-> '[3,4]'; +SELECT l2_distance('[501]'::halfvec, '[1]'); +SELECT l2_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); +SELECT l2_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); SELECT inner_product('[1,2]'::halfvec, '[3,4]'); SELECT inner_product('[1,2]'::halfvec, '[3]'); SELECT inner_product('[65504]'::halfvec, '[65504]'); SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); SELECT '[1,2]'::halfvec <#> '[3,4]'; +SELECT inner_product('[50000,50000]'::halfvec, '[1,1]'); +SELECT inner_product('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); +SELECT inner_product('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); @@ -105,6 +111,9 @@ SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); SELECT '[1,2]'::halfvec <=> '[2,4]'; +SELECT cosine_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); +SELECT cosine_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); +SELECT cosine_distance('[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]'::halfvec, '[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]'); SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); @@ -112,6 +121,8 @@ SELECT l1_distance('[1,2]'::halfvec, '[3]'); SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); SELECT '[0,0]'::halfvec <+> '[3,4]'; +SELECT l1_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); +SELECT l1_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'); SELECT l2_normalize('[3,4]'::halfvec); SELECT l2_normalize('[3,0]'::halfvec); diff --git a/test/sql/halfvec_functions_fp16.sql b/test/sql/halfvec_functions_fp16.sql deleted file mode 100644 index e930a840e..000000000 --- a/test/sql/halfvec_functions_fp16.sql +++ /dev/null @@ -1,31 +0,0 @@ -SET halfvec.use_fp16_compute = true; -SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); -SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); -SELECT l2_distance('[1,2]'::halfvec, '[3]'); -SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); -SELECT '[0,0]'::halfvec <-> '[3,4]'; - -SELECT inner_product('[1,2]'::halfvec, '[3,4]'); -SELECT inner_product('[1,2]'::halfvec, '[3]'); -SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); -SELECT '[1,2]'::halfvec <#> '[3,4]'; - -SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); -SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); -SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); -SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); -SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]'); -SELECT cosine_distance('[1,2]'::halfvec, '[3]'); -SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); -SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); -SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); -SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); -SELECT '[1,2]'::halfvec <=> '[2,4]'; - -SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); -SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); -SELECT l1_distance('[1,2]'::halfvec, '[3]'); -SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); -SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); -SELECT '[0,0]'::halfvec <+> '[3,4]'; - From 9e35aa8175f5cd7b56883cb00758177be5e67870 Mon Sep 17 00:00:00 2001 From: Luca Giacchino Date: Mon, 28 Oct 2024 13:50:32 -0700 Subject: [PATCH 3/5] Implement AVX512 vector_to_halfvec conversion --- src/halfutils.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ src/halfutils.h | 3 +++ src/halfvec.c | 3 +-- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/halfutils.c b/src/halfutils.c index 4b4cf9d4b..e6048bab1 100644 --- a/src/halfutils.c +++ b/src/halfutils.c @@ -32,6 +32,8 @@ float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx); float (*HalfvecL1Distance) (int dim, half * ax, half * bx); +void (*Float4ToHalfVector) (Vector * vec, HalfVector * result); + static float HalfvecL2SquaredDistanceDefault(int dim, half * ax, half * bx) { @@ -562,6 +564,67 @@ HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) #endif #endif +static void +Float4ToHalfVectorDefault(Vector * vec, HalfVector * result) { + for (int i = 0; i < vec->dim; i++) + result->x[i] = Float4ToHalf(vec->x[i]); +} + +#ifdef HAVE_AVX512FP16 +TARGET_AVX512FP16 static void +Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result) { + unsigned long mask; + __m512 vec_512; + __m256h vec_256h; + __mmask16 vec_512_inf; + __mmask16 vec_256h_inf; + + for (int i = 0; i < vec->dim; i += 16) + { + if (vec->dim - i < 16) + { + mask = (1 << (vec->dim - i)) - 1; + vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i); + vec_256h = _mm512_cvtxps_ph(vec_512); + _mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h)); + } + else + { + vec_512 = _mm512_loadu_ps(vec->x + i); + vec_256h = _mm512_cvtxps_ph(vec_512); + _mm256_storeu_ph(result->x + i, vec_256h); + } + + /* Test for positive and negative infinity */ + vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10); + vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10); + if (unlikely(vec_512_inf != vec_256h_inf)) + { + float num; + char* buf; + + __mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf); + /* Find first element in vector to overflow after conversion (first bit set) */ + int count = 0; + while (diff % 2 == 0) { + diff >>= 1; + count++; + } + num = vec->x[i + count]; + + /* TODO Avoid duplicate code in Float4ToHalf */ + buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN); + + float_to_shortest_decimal_buf(num, buf); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type halfvec", buf))); + } + } +} +#endif + #ifdef HALFVEC_DISPATCH #define CPU_FEATURE_FMA (1 << 12) #define CPU_FEATURE_OSXSAVE (1 << 27) @@ -667,6 +730,7 @@ HalfvecInit(void) HalfvecInnerProduct = HalfvecInnerProductDefault; HalfvecCosineSimilarity = HalfvecCosineSimilarityDefault; HalfvecL1Distance = HalfvecL1DistanceDefault; + Float4ToHalfVector = Float4ToHalfVectorDefault; #ifdef HALFVEC_DISPATCH if (SupportsCpuFeature(CPU_FEATURE_AVX | CPU_FEATURE_F16C | CPU_FEATURE_FMA)) @@ -685,6 +749,7 @@ HalfvecInit(void) HalfvecInnerProduct = HalfvecInnerProductAvx512; HalfvecCosineSimilarity = HalfvecCosineSimilarityAvx512; HalfvecL1Distance = HalfvecL1DistanceAvx512; + Float4ToHalfVector = Float4ToHalfVectorAvx512; } #endif #endif diff --git a/src/halfutils.h b/src/halfutils.h index c684f72d7..1a33d1748 100644 --- a/src/halfutils.h +++ b/src/halfutils.h @@ -5,6 +5,7 @@ #include "common/shortest_dec.h" #include "halfvec.h" +#include "vector.h" #ifdef F16C_SUPPORT #include @@ -15,6 +16,8 @@ extern float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); extern double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx); extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx); +extern void (*Float4ToHalfVector) (Vector * vec, HalfVector * result); + void HalfvecInit(void); /* diff --git a/src/halfvec.c b/src/halfvec.c index 6b926e1ee..f9a42a559 100644 --- a/src/halfvec.c +++ b/src/halfvec.c @@ -533,8 +533,7 @@ vector_to_halfvec(PG_FUNCTION_ARGS) result = InitHalfVector(vec->dim); - for (int i = 0; i < vec->dim; i++) - result->x[i] = Float4ToHalf(vec->x[i]); + Float4ToHalfVector(vec, result); PG_RETURN_POINTER(result); } From e93f55eeec9c976e4aa826e67a44e21b5e6bb28e Mon Sep 17 00:00:00 2001 From: Luca Giacchino Date: Tue, 26 Nov 2024 19:08:16 -0800 Subject: [PATCH 4/5] Add CI job for AVX512 FP16 optimized path --- .github/workflows/build.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 50e65a816..9f541f131 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,6 +41,38 @@ jobs: sudo apt-get update sudo apt-get install libipc-run-perl - run: make prove_installcheck + ubuntu_spr: + runs-on: ubuntu-24.04 + if: ${{ !startsWith(github.ref_name, 'mac') && !startsWith(github.ref_name, 'windows') }} + steps: + - uses: actions/checkout@v4 + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + - name: Install Postgres + run: | + sudo apt-get install postgresql-16 + sudo apt-get install postgresql-server-dev-16 + sudo systemctl stop postgresql + pgdir=$(pg_config --bindir) + pgdata=/tmp/postgres_data + mkdir $pgdata + $pgdir/initdb -D $pgdata --username=$USER + sudo chmod 777 /var/run/postgresql/ + sde -spr -mix -- $pgdir/pg_ctl -D $pgdata start + - run: make + env: + PG_CFLAGS: -DUSE_ASSERT_CHECKING -Wall -Wextra -Werror -Wno-unused-parameter -Wno-sign-compare + - run: | + export PG_CONFIG=`which pg_config` + sudo --preserve-env=PG_CONFIG make install + - run: sde -spr -mix -- make installcheck + - if: ${{ failure() }} + run: cat regression.diffs + - name: Report AVX512 FP16 FMA instruction count + run: cat sde*.txt | grep -E "FMA.*PH" mac: runs-on: ${{ matrix.os }} if: ${{ !startsWith(github.ref_name, 'windows') }} From f7cf963035c7a27efaa4120484f46a8718c4359f Mon Sep 17 00:00:00 2001 From: Luca Giacchino Date: Tue, 25 Mar 2025 13:41:38 -0700 Subject: [PATCH 5/5] Move AVX-512 functions to separate files --- Makefile | 6 + src/halfutils.c | 440 +--------------------------------------- src/halfutils_avx512.c | 449 +++++++++++++++++++++++++++++++++++++++++ src/halfutils_avx512.h | 25 +++ 4 files changed, 482 insertions(+), 438 deletions(-) create mode 100644 src/halfutils_avx512.c create mode 100644 src/halfutils_avx512.h diff --git a/Makefile b/Makefile index d98d73b8c..e89ee1342 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,9 @@ MODULE_big = vector DATA = $(wildcard sql/*--*--*.sql) DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o +ifneq ($(USE_AVX512), 0) + OBJS += src/halfutils_avx512.o +endif HEADERS = src/halfvec.h src/sparsevec.h src/vector.h TESTS = $(wildcard test/sql/*.sql) @@ -31,6 +34,9 @@ endif # - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html # - Clang (could use pragma instead) - https://llvm.org/docs/Vectorizers.html PG_CFLAGS += $(OPTFLAGS) -ftree-vectorize -fassociative-math -fno-signed-zeros -fno-trapping-math +ifneq ($(USE_AVX512), 0) + PG_CFLAGS += -DUSE_AVX512 +endif # Debug GCC auto-vectorization # PG_CFLAGS += -fopt-info-vec diff --git a/src/halfutils.c b/src/halfutils.c index e6048bab1..bdc370a19 100644 --- a/src/halfutils.c +++ b/src/halfutils.c @@ -5,6 +5,7 @@ #ifdef HALFVEC_DISPATCH #include +#include "halfutils_avx512.h" #if defined(USE__GET_CPUID) #include @@ -12,18 +13,10 @@ #include #endif -#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \ - (defined(__clang__) && (__clang_major__ >= 16)) || \ - (defined __AVX512FP16__) -#define HAVE_AVX512FP16 -#endif - #ifdef _MSC_VER #define TARGET_F16C -#define TARGET_AVX512FP16 #else #define TARGET_F16C __attribute__((target("avx,f16c,fma"))) -#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw"))) #endif #endif @@ -84,93 +77,6 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx) return distance; } - -#ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static inline bool -HasInfinity(__m512h val) { - /* Test for positive and negative infinity */ - __mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10); - return mask != 0; -} - -TARGET_AVX512FP16 static inline __m512 -ConvertToFp32Sum(__m512h val) { - __m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0)); - __m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1)); - return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper)); -} - -TARGET_AVX512FP16 static float -HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx) -{ - float distance; - int i; - unsigned long mask; - - /* For FP16 computation */ - __m512h axi_512h; - __m512h bxi_512h; - __m512h diff_512h; - __m512h dist_512h = _mm512_setzero_ph(); - __m512h dist_512h_temp; - - /* For FP32 computation */ - __m256h axi_256h; - __m256h bxi_256h; - __m512 axi_512; - __m512 bxi_512; - __m512 diff_512; - __m512 dist_512; - - /* FP16 computation */ - for (i = 0; i < dim; i += 32) - { - if (dim - i < 32) - { - mask = (1 << (dim - i)) - 1; - axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_512h = _mm512_loadu_ph(ax + i); - bxi_512h = _mm512_loadu_ph(bx + i); - } - diff_512h = _mm512_sub_ph(axi_512h, bxi_512h); - dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h); - - /* if overflow, continue with FP32 */ - if (HasInfinity(dist_512h_temp)) - break; - else - dist_512h = dist_512h_temp; - } - dist_512 = ConvertToFp32Sum(dist_512h); - - /* FP32 computation */ - for (; i < dim; i += 16) - { - if (dim - i < 16) - { - mask = (1 << (dim - i)) - 1; - axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_256h = _mm256_loadu_ph(ax + i); - bxi_256h = _mm256_loadu_ph(bx + i); - } - axi_512 = _mm512_cvtxph_ps(axi_256h); - bxi_512 = _mm512_cvtxph_ps(bxi_256h); - diff_512 = _mm512_sub_ps(axi_512, bxi_512); - dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512); - } - - distance = _mm512_reduce_add_ps(dist_512); - return distance; -} -#endif #endif static float @@ -214,75 +120,6 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx) return distance; } - -#ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static float -HalfvecInnerProductAvx512(int dim, half * ax, half * bx) -{ - float distance; - int i; - unsigned int mask; - - /* For FP16 computation */ - __m512h axi_512h; - __m512h bxi_512h; - __m512h dist_512h = _mm512_setzero_ph(); - __m512h dist_512h_temp; - - /* For FP32 computation */ - __m256h axi_256h; - __m256h bxi_256h; - __m512 axi_512; - __m512 bxi_512; - __m512 dist_512; - - /* FP16 computation */ - for (i = 0; i < dim; i += 32) - { - if (dim - i < 32) - { - mask = (1 << (dim - i)) - 1; - axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_512h = _mm512_loadu_ph(ax + i); - bxi_512h = _mm512_loadu_ph(bx + i); - } - dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h); - - /* if overflow, continue with FP32 */ - if (HasInfinity(dist_512h_temp)) - break; - else - dist_512h = dist_512h_temp; - } - dist_512 = ConvertToFp32Sum(dist_512h); - - /* FP32 computation */ - for (; i < dim; i += 16) - { - if (dim - i < 16) - { - mask = (1 << (dim - i)) - 1; - axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_256h = _mm256_loadu_ph(ax + i); - bxi_256h = _mm256_loadu_ph(bx + i); - } - axi_512 = _mm512_cvtxph_ps(axi_256h); - bxi_512 = _mm512_cvtxph_ps(bxi_256h); - dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512); - } - - distance = _mm512_reduce_add_ps(dist_512); - return distance; -} -#endif #endif static double @@ -356,97 +193,6 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx) /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ return (double) similarity / sqrt((double) norma * (double) normb); } - -#ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static double -HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx) -{ - float similarity; - float norma; - float normb; - int i; - unsigned int mask; - - /* For FP16 computation */ - __m512h axi_512h; - __m512h bxi_512h; - __m512h sim_512h = _mm512_setzero_ph(); - __m512h na_512h = _mm512_setzero_ph(); - __m512h nb_512h = _mm512_setzero_ph(); - __m512h sim_512h_temp; - __m512h na_512h_temp; - __m512h nb_512h_temp; - - /* For FP32 computation */ - __m256h axi_256h; - __m256h bxi_256h; - __m512 axi_512; - __m512 bxi_512; - __m512 sim_512; - __m512 na_512; - __m512 nb_512; - - /* FP16 computation */ - for (i = 0; i < dim; i += 32) - { - if (dim - i < 32) { - mask = (1 << (dim - i)) - 1; - axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - } - else { - axi_512h = _mm512_loadu_ph(ax + i); - bxi_512h = _mm512_loadu_ph(bx + i); - } - sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h); - na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h); - nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h); - - /* if overflow, continue with FP32 */ - if (HasInfinity(sim_512h_temp) || - HasInfinity(na_512h_temp) || - HasInfinity(nb_512h_temp)) - break; - else - { - sim_512h = sim_512h_temp; - na_512h = na_512h_temp; - nb_512h = nb_512h_temp; - } - } - sim_512 = ConvertToFp32Sum(sim_512h); - na_512 = ConvertToFp32Sum(na_512h); - nb_512 = ConvertToFp32Sum(nb_512h); - - /* FP32 computation */ - for (; i < dim; i += 16) - { - if (dim - i < 16) - { - mask = (1 << (dim - i)) - 1; - axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_256h = _mm256_loadu_ph(ax + i); - bxi_256h = _mm256_loadu_ph(bx + i); - } - axi_512 = _mm512_cvtxph_ps(axi_256h); - bxi_512 = _mm512_cvtxph_ps(bxi_256h); - sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512); - na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512); - nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512); - } - - similarity = _mm512_reduce_add_ps(sim_512); - norma = _mm512_reduce_add_ps(na_512); - normb = _mm512_reduce_add_ps(nb_512); - - /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ - return (double) similarity / sqrt((double) norma * (double) normb); -} -#endif #endif static float @@ -492,76 +238,6 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx) return distance; } - -#ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static float -HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) -{ - float distance; - int i; - unsigned long mask; - - /* For FP16 computation */ - __m512h axi_512h; - __m512h bxi_512h; - __m512h dist_512h = _mm512_setzero_ph(); - __m512h dist_512h_temp; - - /* For FP32 computation */ - __m256h axi_256h; - __m256h bxi_256h; - __m512 axi_512; - __m512 bxi_512; - __m512 dist_512; - - /* FP16 computation */ - for (i = 0; i < dim; i += 32) - { - if (dim - i < 32) - { - mask = (1 << (dim - i)) - 1; - axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); - bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_512h = _mm512_loadu_ph(ax + i); - bxi_512h = _mm512_loadu_ph(bx + i); - } - dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h))); - - /* if overflow, continue with FP32 */ - if (HasInfinity(dist_512h_temp)) - break; - else - dist_512h = dist_512h_temp; - } - dist_512 = ConvertToFp32Sum(dist_512h); - - /* FP32 computation */ - for (; i < dim; i += 16) - { - if (dim - i < 16) - { - mask = (1 << (dim - i)) - 1; - axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); - bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); - } - else - { - axi_256h = _mm256_loadu_ph(ax + i); - bxi_256h = _mm256_loadu_ph(bx + i); - } - axi_512 = _mm512_cvtxph_ps(axi_256h); - bxi_512 = _mm512_cvtxph_ps(bxi_256h); - dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512))); - } - - distance = _mm512_reduce_add_ps(dist_512); - - return distance; -} -#endif #endif static void @@ -570,60 +246,6 @@ Float4ToHalfVectorDefault(Vector * vec, HalfVector * result) { result->x[i] = Float4ToHalf(vec->x[i]); } -#ifdef HAVE_AVX512FP16 -TARGET_AVX512FP16 static void -Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result) { - unsigned long mask; - __m512 vec_512; - __m256h vec_256h; - __mmask16 vec_512_inf; - __mmask16 vec_256h_inf; - - for (int i = 0; i < vec->dim; i += 16) - { - if (vec->dim - i < 16) - { - mask = (1 << (vec->dim - i)) - 1; - vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i); - vec_256h = _mm512_cvtxps_ph(vec_512); - _mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h)); - } - else - { - vec_512 = _mm512_loadu_ps(vec->x + i); - vec_256h = _mm512_cvtxps_ph(vec_512); - _mm256_storeu_ph(result->x + i, vec_256h); - } - - /* Test for positive and negative infinity */ - vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10); - vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10); - if (unlikely(vec_512_inf != vec_256h_inf)) - { - float num; - char* buf; - - __mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf); - /* Find first element in vector to overflow after conversion (first bit set) */ - int count = 0; - while (diff % 2 == 0) { - diff >>= 1; - count++; - } - num = vec->x[i + count]; - - /* TODO Avoid duplicate code in Float4ToHalf */ - buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN); - - float_to_shortest_decimal_buf(num, buf); - - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("\"%s\" is out of range for type halfvec", buf))); - } - } -} -#endif #ifdef HALFVEC_DISPATCH #define CPU_FEATURE_FMA (1 << 12) @@ -659,64 +281,6 @@ SupportsCpuFeature(unsigned int feature) /* Now check features */ return (exx[2] & feature) == feature; } - -#ifdef HAVE_AVX512FP16 -TARGET_XSAVE static bool -SupportsOsXsave() -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#else - __cpuid(exx, 1); -#endif - - return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE; -} - -#define CPU_FEATURE_AVX512F (1 << 16) -#define CPU_FEATURE_AVX512DQ (1 << 17) -#define CPU_FEATURE_AVX512_FP16 (1 << 23) -#define CPU_FEATURE_AVX512BW (1 << 30) -#define CPU_FEATURE_AVX512VL (1 << 31) - -TARGET_XSAVE static bool -SupportsAvx512Fp16() -{ - unsigned int exx[4] = {0, 0, 0, 0}; - - /* AVX512 features required: - * AVX512F : sub/fma/add instructions - * AVX512DQ: _mm512_extracti32x8_epi32 - * AVX512VL: _mm256_loadu_ph - * AVX512BW: masked loads - */ - unsigned int features = CPU_FEATURE_AVX512F | - CPU_FEATURE_AVX512DQ | - CPU_FEATURE_AVX512VL | - CPU_FEATURE_AVX512BW; - - /* Check OS supports XSAVE */ - if (!SupportsOsXsave()) - return false; - - /* Check XMM, YMM, and ZMM registers are enabled */ - if ((_xgetbv(0) & 0xe6) != 0xe6) - return false; - -#if defined(HAVE__GET_CPUID) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 7, 0); -#endif - - if ((exx[1] & features) != features) - return false; - - return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16; -} -#endif #endif void @@ -742,7 +306,7 @@ HalfvecInit(void) HalfvecL1Distance = HalfvecL1DistanceF16c; } -#ifdef HAVE_AVX512FP16 +#if defined(USE_AVX512) && defined(HAVE_AVX512FP16) if (SupportsAvx512Fp16()) { HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceAvx512; diff --git a/src/halfutils_avx512.c b/src/halfutils_avx512.c new file mode 100644 index 000000000..27d907781 --- /dev/null +++ b/src/halfutils_avx512.c @@ -0,0 +1,449 @@ +#ifdef USE_AVX512 +#include "halfutils_avx512.h" + +#ifdef HAVE_AVX512FP16 +#include "common/shortest_dec.h" + +#include +#include + +#if defined(USE__GET_CPUID) +#include +#else +#include +#endif + +#ifdef _MSC_VER +#define TARGET_AVX512FP16 +#else +#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw"))) +#endif + +TARGET_AVX512FP16 static inline bool +HasInfinity(__m512h val) { + /* Test for positive and negative infinity */ + __mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10); + return mask != 0; +} + +TARGET_AVX512FP16 static inline __m512 +ConvertToFp32Sum(__m512h val) { + __m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0)); + __m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1)); + return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper)); +} + +TARGET_AVX512FP16 float +HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx) +{ + float distance; + int i; + unsigned long mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h diff_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 diff_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) + { + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + diff_512h = _mm512_sub_ph(axi_512h, bxi_512h); + dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; + } + dist_512 = ConvertToFp32Sum(dist_512h); + + /* FP32 computation */ + for (; i < dim; i += 16) + { + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + diff_512 = _mm512_sub_ps(axi_512, bxi_512); + dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512); + } + + distance = _mm512_reduce_add_ps(dist_512); + return distance; +} + +TARGET_AVX512FP16 float +HalfvecInnerProductAvx512(int dim, half * ax, half * bx) +{ + float distance; + int i; + unsigned int mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) + { + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; + } + dist_512 = ConvertToFp32Sum(dist_512h); + + /* FP32 computation */ + for (; i < dim; i += 16) + { + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512); + } + + distance = _mm512_reduce_add_ps(dist_512); + return distance; +} + +TARGET_AVX512FP16 double +HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx) +{ + float similarity; + float norma; + float normb; + int i; + unsigned int mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h sim_512h = _mm512_setzero_ph(); + __m512h na_512h = _mm512_setzero_ph(); + __m512h nb_512h = _mm512_setzero_ph(); + __m512h sim_512h_temp; + __m512h na_512h_temp; + __m512h nb_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 sim_512; + __m512 na_512; + __m512 nb_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) + { + if (dim - i < 32) { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h); + na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h); + nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h); + + /* if overflow, continue with FP32 */ + if (HasInfinity(sim_512h_temp) || + HasInfinity(na_512h_temp) || + HasInfinity(nb_512h_temp)) + break; + else + { + sim_512h = sim_512h_temp; + na_512h = na_512h_temp; + nb_512h = nb_512h_temp; + } + } + sim_512 = ConvertToFp32Sum(sim_512h); + na_512 = ConvertToFp32Sum(na_512h); + nb_512 = ConvertToFp32Sum(nb_512h); + + /* FP32 computation */ + for (; i < dim; i += 16) + { + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512); + na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512); + nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512); + } + + similarity = _mm512_reduce_add_ps(sim_512); + norma = _mm512_reduce_add_ps(na_512); + normb = _mm512_reduce_add_ps(nb_512); + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); +} + +TARGET_AVX512FP16 float +HalfvecL1DistanceAvx512(int dim, half * ax, half * bx) +{ + float distance; + int i; + unsigned long mask; + + /* For FP16 computation */ + __m512h axi_512h; + __m512h bxi_512h; + __m512h dist_512h = _mm512_setzero_ph(); + __m512h dist_512h_temp; + + /* For FP32 computation */ + __m256h axi_256h; + __m256h bxi_256h; + __m512 axi_512; + __m512 bxi_512; + __m512 dist_512; + + /* FP16 computation */ + for (i = 0; i < dim; i += 32) + { + if (dim - i < 32) + { + mask = (1 << (dim - i)) - 1; + axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i)); + bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_512h = _mm512_loadu_ph(ax + i); + bxi_512h = _mm512_loadu_ph(bx + i); + } + dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h))); + + /* if overflow, continue with FP32 */ + if (HasInfinity(dist_512h_temp)) + break; + else + dist_512h = dist_512h_temp; + } + dist_512 = ConvertToFp32Sum(dist_512h); + + /* FP32 computation */ + for (; i < dim; i += 16) + { + if (dim - i < 16) + { + mask = (1 << (dim - i)) - 1; + axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i)); + bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i)); + } + else + { + axi_256h = _mm256_loadu_ph(ax + i); + bxi_256h = _mm256_loadu_ph(bx + i); + } + axi_512 = _mm512_cvtxph_ps(axi_256h); + bxi_512 = _mm512_cvtxph_ps(bxi_256h); + dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512))); + } + + distance = _mm512_reduce_add_ps(dist_512); + + return distance; +} + +TARGET_AVX512FP16 void +Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result) +{ + unsigned long mask; + __m512 vec_512; + __m256h vec_256h; + __mmask16 vec_512_inf; + __mmask16 vec_256h_inf; + + for (int i = 0; i < vec->dim; i += 16) + { + if (vec->dim - i < 16) + { + mask = (1 << (vec->dim - i)) - 1; + vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i); + vec_256h = _mm512_cvtxps_ph(vec_512); + _mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h)); + } + else + { + vec_512 = _mm512_loadu_ps(vec->x + i); + vec_256h = _mm512_cvtxps_ph(vec_512); + _mm256_storeu_ph(result->x + i, vec_256h); + } + + /* Test for positive and negative infinity */ + vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10); + vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10); + if (unlikely(vec_512_inf != vec_256h_inf)) + { + float num; + char* buf; + + __mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf); + /* Find first element in vector to overflow after conversion (first bit set) */ + int count = 0; + while (diff % 2 == 0) { + diff >>= 1; + count++; + } + num = vec->x[i + count]; + + /* TODO Avoid duplicate code in Float4ToHalf */ + buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN); + + float_to_shortest_decimal_buf(num, buf); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type halfvec", buf))); + } + } +} + +#define CPU_FEATURE_OSXSAVE (1 << 27) +#define CPU_FEATURE_AVX512F (1 << 16) +#define CPU_FEATURE_AVX512DQ (1 << 17) +#define CPU_FEATURE_AVX512_FP16 (1 << 23) +#define CPU_FEATURE_AVX512BW (1 << 30) +#define CPU_FEATURE_AVX512VL (1 << 31) + +#ifdef _MSC_VER +#define TARGET_XSAVE +#else +#define TARGET_XSAVE __attribute__((target("xsave"))) +#endif + +TARGET_XSAVE static bool +SupportsOsXsave() +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#else + __cpuid(exx, 1); +#endif + + return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE; +} + +TARGET_XSAVE bool +SupportsAvx512Fp16() +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* AVX512 features required: + * AVX512F : sub/fma/add instructions + * AVX512DQ: _mm512_extracti32x8_epi32 + * AVX512VL: _mm256_loadu_ph + * AVX512BW: masked loads + */ + unsigned int features = CPU_FEATURE_AVX512F | + CPU_FEATURE_AVX512DQ | + CPU_FEATURE_AVX512VL | + CPU_FEATURE_AVX512BW; + + /* Check OS supports XSAVE */ + if (!SupportsOsXsave()) + return false; + + /* Check XMM, YMM, and ZMM registers are enabled */ + if ((_xgetbv(0) & 0xe6) != 0xe6) + return false; + +#if defined(HAVE__GET_CPUID) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 7, 0); +#endif + + if ((exx[1] & features) != features) + return false; + + return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16; +} + +#endif +#endif diff --git a/src/halfutils_avx512.h b/src/halfutils_avx512.h new file mode 100644 index 000000000..9f94f11dc --- /dev/null +++ b/src/halfutils_avx512.h @@ -0,0 +1,25 @@ +#ifndef HALFUTILS_AVX512_H +#define HALFUTILS_AVX512_H + +#ifdef USE_AVX512 +#include "postgres.h" +#include "halfvec.h" +#include "vector.h" + +#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \ + (defined(__clang__) && (__clang_major__ >= 16)) || \ + (defined __AVX512FP16__) +#define HAVE_AVX512FP16 +#endif + +#ifdef HAVE_AVX512FP16 +extern float HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx); +extern float HalfvecInnerProductAvx512(int dim, half * ax, half * bx); +extern double HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx); +extern float HalfvecL1DistanceAvx512(int dim, half * ax, half * bx); +extern void Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result); + +extern bool SupportsAvx512Fp16(void); +#endif +#endif +#endif