From c626e267f2146134c5e2e4efe931ec6bdccce92c Mon Sep 17 00:00:00 2001
From: Luca Giacchino <luca.giacchino@intel.com>
Date: Tue, 9 Apr 2024 09:08:26 -0700
Subject: [PATCH 1/5] Implement AVX512 distance calculations for halfvec

---
 src/halfutils.c                          | 400 +++++++++++++++++++++++
 src/halfutils.h                          |   3 +
 test/expected/halfvec_functions_fp16.out | 141 ++++++++
 test/sql/halfvec_functions_fp16.sql      |  31 ++
 4 files changed, 575 insertions(+)
 create mode 100644 test/expected/halfvec_functions_fp16.out
 create mode 100644 test/sql/halfvec_functions_fp16.sql

diff --git a/src/halfutils.c b/src/halfutils.c
index d16909409..9ef916803 100644
--- a/src/halfutils.c
+++ b/src/halfutils.c
@@ -2,6 +2,7 @@
 
 #include "halfutils.h"
 #include "halfvec.h"
+#include "utils/guc.h"
 
 #ifdef HALFVEC_DISPATCH
 #include <immintrin.h>
@@ -12,13 +13,23 @@
 #include <intrin.h>
 #endif
 
+#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \
+	(defined(__clang__) && (__clang_major__ >= 16)) || \
+	(defined __AVX512FP16__)
+#define HAVE_AVX512FP16
+#endif
+
 #ifdef _MSC_VER
 #define TARGET_F16C
+#define TARGET_AVX512FP16
 #else
 #define TARGET_F16C __attribute__((target("avx,f16c,fma")))
+#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512vl,avx512bw")))
 #endif
 #endif
 
+bool halfvec_use_fp16_compute;
+
 float		(*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx);
 float		(*HalfvecInnerProduct) (int dim, half * ax, half * bx);
 double		(*HalfvecCosineSimilarity) (int dim, half * ax, half * bx);
@@ -74,6 +85,85 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
+
+#ifdef HAVE_AVX512FP16
+TARGET_AVX512FP16 static float
+HalfvecL2SquaredDistanceAvx512Fp16(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 32) * 32;
+	unsigned long mask;
+	__m512h		axi;
+	__m512h		bxi;
+	__m512h		diff;
+	__m512h		dist = _mm512_setzero_ph();
+
+	for (i = 0; i < count; i += 32)
+	{
+		axi = _mm512_loadu_ph(ax+i);
+		bxi = _mm512_loadu_ph(bx+i);
+		diff = _mm512_sub_ph(axi, bxi);
+		dist = _mm512_fmadd_ph(diff, diff, dist);
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+	diff = _mm512_sub_ph(axi, bxi);
+	dist = _mm512_fmadd_ph(diff, diff, dist);
+
+	distance = (float)_mm512_reduce_add_ph(dist);
+
+	return distance;
+}
+
+TARGET_AVX512FP16 static float
+HalfvecL2SquaredDistanceAvx512Fp32(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 16) * 16;
+	unsigned long mask;
+	__m256h		axi;
+	__m256h		bxi;
+	__m512		axs;
+	__m512		bxs;
+	__m512		diff;
+	__m512		dist = _mm512_setzero_ps();
+
+	for (i = 0; i < count; i += 16)
+	{
+		axi = _mm256_loadu_ph(ax+i);
+		bxi = _mm256_loadu_ph(bx+i);
+		axs = _mm512_cvtxph_ps(axi);
+		bxs = _mm512_cvtxph_ps(bxi);
+		diff = _mm512_sub_ps(axs, bxs);
+		dist = _mm512_fmadd_ps(diff, diff, dist);
+	}
+	
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+	axs = _mm512_cvtxph_ps(axi);
+	bxs = _mm512_cvtxph_ps(bxi);
+	diff = _mm512_sub_ps(axs, bxs);
+	dist = _mm512_fmadd_ps(diff, diff, dist);
+
+	distance = (float)_mm512_reduce_add_ps(dist);
+
+	return distance;
+}
+
+static float
+HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx)
+{
+	if (halfvec_use_fp16_compute)
+		return HalfvecL2SquaredDistanceAvx512Fp16(dim, ax, bx);
+	else
+		return HalfvecL2SquaredDistanceAvx512Fp32(dim, ax, bx);
+}
+#endif
 #endif
 
 static float
@@ -117,6 +207,79 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
+
+#ifdef HAVE_AVX512FP16
+TARGET_AVX512FP16 static float
+HalfvecInnerProductAvx512Fp16(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 32) * 32;
+	unsigned long mask;
+	__m512h		axi;
+	__m512h		bxi;
+	__m512h		dist = _mm512_setzero_ph();
+
+	for (i = 0; i < count; i += 32)
+	{
+		axi = _mm512_loadu_ph(ax+i);
+		bxi = _mm512_loadu_ph(bx+i);
+		dist = _mm512_fmadd_ph(axi, bxi, dist);
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+	dist = _mm512_fmadd_ph(axi, bxi, dist);
+
+	distance = (float)_mm512_reduce_add_ph(dist);
+
+	return distance;
+}
+
+TARGET_AVX512FP16 static float
+HalfvecInnerProductAvx512Fp32(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 16) * 16;
+	unsigned long mask;
+	__m256h		axi;
+	__m256h		bxi;
+	__m512		axs;
+	__m512		bxs;
+	__m512		dist = _mm512_setzero_ps();
+
+	for (i = 0; i < count; i += 16)
+	{
+		axi = _mm256_loadu_ph(ax+i);
+		bxi = _mm256_loadu_ph(bx+i);
+		axs = _mm512_cvtxph_ps(axi);
+		bxs = _mm512_cvtxph_ps(bxi);
+		dist = _mm512_fmadd_ps(axs, bxs, dist);
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+	axs = _mm512_cvtxph_ps(axi);
+	bxs = _mm512_cvtxph_ps(bxi);
+	dist = _mm512_fmadd_ps(axs, bxs, dist);
+
+	distance = (float)_mm512_reduce_add_ps(dist);
+
+	return distance;
+}
+
+static float
+HalfvecInnerProductAvx512(int dim, half * ax, half * bx)
+{
+	if (halfvec_use_fp16_compute)
+		return HalfvecInnerProductAvx512Fp16(dim, ax, bx);
+	else
+		return HalfvecInnerProductAvx512Fp32(dim, ax, bx);
+}
+#endif
 #endif
 
 static double
@@ -190,6 +353,101 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx)
 	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
 	return (double) similarity / sqrt((double) norma * (double) normb);
 }
+
+#ifdef HAVE_AVX512FP16
+TARGET_AVX512FP16 static double
+HalfvecCosineSimilarityAvx512Fp16(int dim, half * ax, half * bx)
+{
+	float		similarity;
+	float		norma;
+	float		normb;
+	int			i;
+	int			count = (dim / 32) * 32;
+	unsigned long mask;
+	__m512h		axi;
+	__m512h		bxi;
+	__m512h		sim = _mm512_setzero_ph();
+	__m512h		na = _mm512_setzero_ph();
+	__m512h		nb = _mm512_setzero_ph();
+
+	for (i = 0; i < count; i += 32)
+	{
+		axi = _mm512_loadu_ph(ax+i);
+		bxi = _mm512_loadu_ph(bx+i);
+		sim = _mm512_fmadd_ph(axi, bxi, sim);
+		na = _mm512_fmadd_ph(axi, axi, na);
+		nb = _mm512_fmadd_ph(bxi, bxi, nb);
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+	sim = _mm512_fmadd_ph(axi, bxi, sim);
+	na = _mm512_fmadd_ph(axi, axi, na);
+	nb = _mm512_fmadd_ph(bxi, bxi, nb);
+
+	similarity = (float)_mm512_reduce_add_ph(sim);
+	norma = (float)_mm512_reduce_add_ph(na);
+	normb = (float)_mm512_reduce_add_ph(nb);
+
+	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
+	return (double) similarity / sqrt((double) norma * (double) normb);
+}
+
+TARGET_AVX512FP16 static double
+HalfvecCosineSimilarityAvx512Fp32(int dim, half * ax, half * bx)
+{
+	float		similarity;
+	float		norma;
+	float		normb;
+	int			i;
+	int			count = (dim / 16) * 16;
+	unsigned long mask;
+	__m256h		axi;
+	__m256h 	bxi;
+	__m512		axs;
+	__m512		bxs;
+	__m512		sim = _mm512_setzero_ps();
+	__m512		na = _mm512_setzero_ps();
+	__m512		nb = _mm512_setzero_ps();
+
+	for (i = 0; i < count; i += 16)
+	{
+		axi = _mm256_loadu_ph(ax+i);
+		bxi = _mm256_loadu_ph(bx+i);
+		axs = _mm512_cvtxph_ps(axi);
+		bxs = _mm512_cvtxph_ps(bxi);
+		sim = _mm512_fmadd_ps(axs, bxs, sim);
+		na = _mm512_fmadd_ps(axs, axs, na);
+		nb = _mm512_fmadd_ps(bxs, bxs, nb);
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+	axs = _mm512_cvtxph_ps(axi);
+	bxs = _mm512_cvtxph_ps(bxi);
+	sim = _mm512_fmadd_ps(axs, bxs, sim);
+	na = _mm512_fmadd_ps(axs, axs, na);
+	nb = _mm512_fmadd_ps(bxs, bxs, nb);
+
+	similarity = (float)_mm512_reduce_add_ps(sim);
+	norma = (float)_mm512_reduce_add_ps(na);
+	normb = (float)_mm512_reduce_add_ps(nb);
+
+	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
+	return (double) similarity / sqrt((double) norma * (double) normb);
+}
+
+static double
+HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx)
+{
+	if (halfvec_use_fp16_compute)
+		return HalfvecCosineSimilarityAvx512Fp16(dim, ax, bx);
+	else
+		return HalfvecCosineSimilarityAvx512Fp32(dim, ax, bx);
+}
+#endif
 #endif
 
 static float
@@ -235,6 +493,79 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
+
+#ifdef HAVE_AVX512FP16
+TARGET_AVX512FP16 static float
+HalfvecL1DistanceAvx512Fp16(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 32) * 32;
+	unsigned long mask;
+	__m512h		axi;
+	__m512h		bxi;
+	__m512h		dist = _mm512_setzero_ph();
+
+	for (i = 0; i < count; i += 32)
+	{
+		axi = _mm512_loadu_ph(ax+i);
+		bxi = _mm512_loadu_ph(bx+i);
+		dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi)));
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+	dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi)));
+
+	distance = (float)_mm512_reduce_add_ph(dist);
+
+	return distance;
+}
+
+TARGET_AVX512FP16 static float
+HalfvecL1DistanceAvx512Fp32(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	int			count = (dim / 16) * 16;
+	unsigned long mask;
+	__m256h		axi;
+	__m256h		bxi;
+	__m512		axs;
+	__m512		bxs;
+	__m512		dist = _mm512_setzero_ps();
+
+	for (i = 0; i < count; i += 16)
+	{
+		axi = _mm256_loadu_ph(ax+i);
+		bxi = _mm256_loadu_ph(bx+i);
+		axs = _mm512_cvtxph_ps(axi);
+		bxs = _mm512_cvtxph_ps(bxi);
+		dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs)));
+	}
+
+	mask = (1 << (dim - i)) - 1;
+	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+	axs = _mm512_cvtxph_ps(axi);
+	bxs = _mm512_cvtxph_ps(bxi);
+	dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs)));
+
+	distance = (float)_mm512_reduce_add_ps(dist);
+
+	return distance;
+}
+
+static float
+HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
+{
+	if (halfvec_use_fp16_compute)
+		return HalfvecL1DistanceAvx512Fp16(dim, ax, bx);
+	else
+		return HalfvecL1DistanceAvx512Fp32(dim, ax, bx);
+}
+#endif
 #endif
 
 #ifdef HALFVEC_DISPATCH
@@ -271,6 +602,61 @@ SupportsCpuFeature(unsigned int feature)
 	/* Now check features */
 	return (exx[2] & feature) == feature;
 }
+
+#ifdef HAVE_AVX512FP16
+TARGET_XSAVE static bool
+SupportsOsXsave()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#else
+	__cpuid(exx, 1);
+#endif
+
+	return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE;
+}
+
+#define CPU_FEATURE_AVX512F         (1 << 16)
+#define CPU_FEATURE_AVX512_FP16     (1 << 23)
+#define CPU_FEATURE_AVX512_BW       (1 << 30)
+#define CPU_FEATURE_AVX512VL     	(1 << 31)
+
+TARGET_XSAVE static bool
+SupportsAvx512Fp16()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* Check OS supports XSAVE */
+	if (!SupportsOsXsave())
+		return false;
+
+	/* Check XMM, YMM, and ZMM registers are enabled */
+	if ((_xgetbv(0) & 0xe6) != 0xe6)
+		return false;
+		
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 7, 0);
+#endif
+
+	/* Required by AVX512 sub/fma/add instructions */
+	if ((exx[1] & CPU_FEATURE_AVX512F) != CPU_FEATURE_AVX512F)
+		return false;
+
+	/* Required by _mm256_loadu_ph */
+    if ((exx[1] & CPU_FEATURE_AVX512VL) != CPU_FEATURE_AVX512VL)
+	    return false;
+
+	/* Required by masked loads in remainder loops */
+	if ((exx[1] & CPU_FEATURE_AVX512_BW) != CPU_FEATURE_AVX512_BW)
+		return false;
+
+	return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16;
+}
+#endif
 #endif
 
 void
@@ -294,5 +680,19 @@ HalfvecInit(void)
 		/* Does not require FMA, but keep logic simple */
 		HalfvecL1Distance = HalfvecL1DistanceF16c;
 	}
+
+#ifdef HAVE_AVX512FP16
+	if (SupportsAvx512Fp16())
+	{
+		HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceAvx512;
+		HalfvecInnerProduct = HalfvecInnerProductAvx512;
+		HalfvecCosineSimilarity = HalfvecCosineSimilarityAvx512;
+		HalfvecL1Distance = HalfvecL1DistanceAvx512;
+	}
 #endif
+#endif
+
+	DefineCustomBoolVariable("halfvec.use_fp16_compute", "Use FP16 computation for distance calculations",
+							"If true, distance calculations are executed in FP16. If false, distance calculations are executed in FP32",
+							&halfvec_use_fp16_compute, false, PGC_USERSET, 0, NULL, NULL, NULL);
 }
diff --git a/src/halfutils.h b/src/halfutils.h
index c684f72d7..99131d583 100644
--- a/src/halfutils.h
+++ b/src/halfutils.h
@@ -17,6 +17,9 @@ extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx);
 
 void		HalfvecInit(void);
 
+/* Variables */
+extern bool	halfvec_use_fp16_compute;
+
 /*
  * Check if half is NaN
  */
diff --git a/test/expected/halfvec_functions_fp16.out b/test/expected/halfvec_functions_fp16.out
new file mode 100644
index 000000000..a82a604b0
--- /dev/null
+++ b/test/expected/halfvec_functions_fp16.out
@@ -0,0 +1,141 @@
+SET halfvec.use_fp16_compute = true;
+SELECT l2_distance('[0,0]'::halfvec, '[3,4]');
+ l2_distance 
+-------------
+           5
+(1 row)
+
+SELECT l2_distance('[0,0]'::halfvec, '[0,1]');
+ l2_distance 
+-------------
+           1
+(1 row)
+
+SELECT l2_distance('[1,2]'::halfvec, '[3]');
+ERROR:  different halfvec dimensions 2 and 1
+SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]');
+ l2_distance 
+-------------
+           5
+(1 row)
+
+SELECT '[0,0]'::halfvec <-> '[3,4]';
+ ?column? 
+----------
+        5
+(1 row)
+
+SELECT inner_product('[1,2]'::halfvec, '[3,4]');
+ inner_product 
+---------------
+            11
+(1 row)
+
+SELECT inner_product('[1,2]'::halfvec, '[3]');
+ERROR:  different halfvec dimensions 2 and 1
+SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+ inner_product 
+---------------
+            45
+(1 row)
+
+SELECT '[1,2]'::halfvec <#> '[3,4]';
+ ?column? 
+----------
+      -11
+(1 row)
+
+SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[1,2]'::halfvec, '[0,0]');
+ cosine_distance 
+-----------------
+             NaN
+(1 row)
+
+SELECT cosine_distance('[1,1]'::halfvec, '[1,1]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[1,0]'::halfvec, '[0,2]');
+ cosine_distance 
+-----------------
+               1
+(1 row)
+
+SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]');
+ cosine_distance 
+-----------------
+               2
+(1 row)
+
+SELECT cosine_distance('[1,2]'::halfvec, '[3]');
+ERROR:  different halfvec dimensions 2 and 1
+SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]');
+ cosine_distance 
+-----------------
+               2
+(1 row)
+
+SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]');
+ cosine_distance 
+-----------------
+               2
+(1 row)
+
+SELECT '[1,2]'::halfvec <=> '[2,4]';
+ ?column? 
+----------
+        0
+(1 row)
+
+SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
+ l1_distance 
+-------------
+           7
+(1 row)
+
+SELECT l1_distance('[0,0]'::halfvec, '[0,1]');
+ l1_distance 
+-------------
+           1
+(1 row)
+
+SELECT l1_distance('[1,2]'::halfvec, '[3]');
+ERROR:  different halfvec dimensions 2 and 1
+SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+ l1_distance 
+-------------
+           0
+(1 row)
+
+SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]');
+ l1_distance 
+-------------
+           9
+(1 row)
+
+SELECT '[0,0]'::halfvec <+> '[3,4]';
+ ?column? 
+----------
+        7
+(1 row)
+
diff --git a/test/sql/halfvec_functions_fp16.sql b/test/sql/halfvec_functions_fp16.sql
new file mode 100644
index 000000000..e930a840e
--- /dev/null
+++ b/test/sql/halfvec_functions_fp16.sql
@@ -0,0 +1,31 @@
+SET halfvec.use_fp16_compute = true;
+SELECT l2_distance('[0,0]'::halfvec, '[3,4]');
+SELECT l2_distance('[0,0]'::halfvec, '[0,1]');
+SELECT l2_distance('[1,2]'::halfvec, '[3]');
+SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]');
+SELECT '[0,0]'::halfvec <-> '[3,4]';
+
+SELECT inner_product('[1,2]'::halfvec, '[3,4]');
+SELECT inner_product('[1,2]'::halfvec, '[3]');
+SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+SELECT '[1,2]'::halfvec <#> '[3,4]';
+
+SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
+SELECT cosine_distance('[1,2]'::halfvec, '[0,0]');
+SELECT cosine_distance('[1,1]'::halfvec, '[1,1]');
+SELECT cosine_distance('[1,0]'::halfvec, '[0,2]');
+SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]');
+SELECT cosine_distance('[1,2]'::halfvec, '[3]');
+SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]');
+SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]');
+SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]');
+SELECT '[1,2]'::halfvec <=> '[2,4]';
+
+SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
+SELECT l1_distance('[0,0]'::halfvec, '[0,1]');
+SELECT l1_distance('[1,2]'::halfvec, '[3]');
+SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
+SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]');
+SELECT '[0,0]'::halfvec <+> '[3,4]';
+

From 38518c5cd6ef01e3e2433d03285b602e08279739 Mon Sep 17 00:00:00 2001
From: Luca Giacchino <luca.giacchino@intel.com>
Date: Wed, 25 Sep 2024 14:33:58 -0700
Subject: [PATCH 2/5] Automate fp16/fp32 halfvec distance computation

---
 src/halfutils.c                          | 539 +++++++++++------------
 src/halfutils.h                          |   3 -
 test/expected/halfvec.out                |  66 +++
 test/expected/halfvec_functions_fp16.out | 141 ------
 test/sql/halfvec.sql                     |  11 +
 test/sql/halfvec_functions_fp16.sql      |  31 --
 6 files changed, 343 insertions(+), 448 deletions(-)
 delete mode 100644 test/expected/halfvec_functions_fp16.out
 delete mode 100644 test/sql/halfvec_functions_fp16.sql

diff --git a/src/halfutils.c b/src/halfutils.c
index 9ef916803..4b4cf9d4b 100644
--- a/src/halfutils.c
+++ b/src/halfutils.c
@@ -2,7 +2,6 @@
 
 #include "halfutils.h"
 #include "halfvec.h"
-#include "utils/guc.h"
 
 #ifdef HALFVEC_DISPATCH
 #include <immintrin.h>
@@ -24,12 +23,10 @@
 #define TARGET_AVX512FP16
 #else
 #define TARGET_F16C __attribute__((target("avx,f16c,fma")))
-#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512vl,avx512bw")))
+#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw")))
 #endif
 #endif
 
-bool halfvec_use_fp16_compute;
-
 float		(*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx);
 float		(*HalfvecInnerProduct) (int dim, half * ax, half * bx);
 double		(*HalfvecCosineSimilarity) (int dim, half * ax, half * bx);
@@ -87,82 +84,90 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx)
 }
 
 #ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static float
-HalfvecL2SquaredDistanceAvx512Fp16(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	int			count = (dim / 32) * 32;
-	unsigned long mask;
-	__m512h		axi;
-	__m512h		bxi;
-	__m512h		diff;
-	__m512h		dist = _mm512_setzero_ph();
-
-	for (i = 0; i < count; i += 32)
-	{
-		axi = _mm512_loadu_ph(ax+i);
-		bxi = _mm512_loadu_ph(bx+i);
-		diff = _mm512_sub_ph(axi, bxi);
-		dist = _mm512_fmadd_ph(diff, diff, dist);
-	}
-
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-	diff = _mm512_sub_ph(axi, bxi);
-	dist = _mm512_fmadd_ph(diff, diff, dist);
-
-	distance = (float)_mm512_reduce_add_ph(dist);
+TARGET_AVX512FP16 static inline bool
+HasInfinity(__m512h val) {
+	/* Test for positive and negative infinity */
+	__mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10);
+	return mask != 0;
+}
 
-	return distance;
+TARGET_AVX512FP16 static inline __m512
+ConvertToFp32Sum(__m512h val) {
+	__m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0));
+	__m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1));
+	return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper));
 }
 
 TARGET_AVX512FP16 static float
-HalfvecL2SquaredDistanceAvx512Fp32(int dim, half * ax, half * bx)
+HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx)
 {
 	float		distance;
 	int			i;
-	int			count = (dim / 16) * 16;
 	unsigned long mask;
-	__m256h		axi;
-	__m256h		bxi;
-	__m512		axs;
-	__m512		bxs;
-	__m512		diff;
-	__m512		dist = _mm512_setzero_ps();
-
-	for (i = 0; i < count; i += 16)
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		diff_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		diff_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
 	{
-		axi = _mm256_loadu_ph(ax+i);
-		bxi = _mm256_loadu_ph(bx+i);
-		axs = _mm512_cvtxph_ps(axi);
-		bxs = _mm512_cvtxph_ps(bxi);
-		diff = _mm512_sub_ps(axs, bxs);
-		dist = _mm512_fmadd_ps(diff, diff, dist);
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		diff_512h = _mm512_sub_ph(axi_512h, bxi_512h);
+		dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
 	}
-	
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-	axs = _mm512_cvtxph_ps(axi);
-	bxs = _mm512_cvtxph_ps(bxi);
-	diff = _mm512_sub_ps(axs, bxs);
-	dist = _mm512_fmadd_ps(diff, diff, dist);
+	dist_512 = ConvertToFp32Sum(dist_512h);
 
-	distance = (float)_mm512_reduce_add_ps(dist);
+	/* FP32 computation */
+	for (; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		diff_512 = _mm512_sub_ps(axi_512, bxi_512);
+		dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512);
+	}
 
+	distance = _mm512_reduce_add_ps(dist_512);
 	return distance;
 }
-
-static float
-HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx)
-{
-	if (halfvec_use_fp16_compute)
-		return HalfvecL2SquaredDistanceAvx512Fp16(dim, ax, bx);
-	else
-		return HalfvecL2SquaredDistanceAvx512Fp32(dim, ax, bx);
-}
 #endif
 #endif
 
@@ -210,75 +215,71 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx)
 
 #ifdef HAVE_AVX512FP16
 TARGET_AVX512FP16 static float
-HalfvecInnerProductAvx512Fp16(int dim, half * ax, half * bx)
+HalfvecInnerProductAvx512(int dim, half * ax, half * bx)
 {
 	float		distance;
 	int			i;
-	int			count = (dim / 32) * 32;
-	unsigned long mask;
-	__m512h		axi;
-	__m512h		bxi;
-	__m512h		dist = _mm512_setzero_ph();
-
-	for (i = 0; i < count; i += 32)
+	unsigned int mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
 	{
-		axi = _mm512_loadu_ph(ax+i);
-		bxi = _mm512_loadu_ph(bx+i);
-		dist = _mm512_fmadd_ph(axi, bxi, dist);
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
 	}
+	dist_512 = ConvertToFp32Sum(dist_512h);
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-	dist = _mm512_fmadd_ph(axi, bxi, dist);
-
-	distance = (float)_mm512_reduce_add_ph(dist);
-
-	return distance;
-}
-
-TARGET_AVX512FP16 static float
-HalfvecInnerProductAvx512Fp32(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	int			count = (dim / 16) * 16;
-	unsigned long mask;
-	__m256h		axi;
-	__m256h		bxi;
-	__m512		axs;
-	__m512		bxs;
-	__m512		dist = _mm512_setzero_ps();
-
-	for (i = 0; i < count; i += 16)
+	/* FP32 computation */
+	for (; i < dim; i += 16)
 	{
-		axi = _mm256_loadu_ph(ax+i);
-		bxi = _mm256_loadu_ph(bx+i);
-		axs = _mm512_cvtxph_ps(axi);
-		bxs = _mm512_cvtxph_ps(bxi);
-		dist = _mm512_fmadd_ps(axs, bxs, dist);
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512);
 	}
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-	axs = _mm512_cvtxph_ps(axi);
-	bxs = _mm512_cvtxph_ps(bxi);
-	dist = _mm512_fmadd_ps(axs, bxs, dist);
-
-	distance = (float)_mm512_reduce_add_ps(dist);
-
+	distance = _mm512_reduce_add_ps(dist_512);
 	return distance;
 }
-
-static float
-HalfvecInnerProductAvx512(int dim, half * ax, half * bx)
-{
-	if (halfvec_use_fp16_compute)
-		return HalfvecInnerProductAvx512Fp16(dim, ax, bx);
-	else
-		return HalfvecInnerProductAvx512Fp32(dim, ax, bx);
-}
 #endif
 #endif
 
@@ -356,97 +357,93 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx)
 
 #ifdef HAVE_AVX512FP16
 TARGET_AVX512FP16 static double
-HalfvecCosineSimilarityAvx512Fp16(int dim, half * ax, half * bx)
+HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx)
 {
 	float		similarity;
 	float		norma;
 	float		normb;
 	int			i;
-	int			count = (dim / 32) * 32;
-	unsigned long mask;
-	__m512h		axi;
-	__m512h		bxi;
-	__m512h		sim = _mm512_setzero_ph();
-	__m512h		na = _mm512_setzero_ph();
-	__m512h		nb = _mm512_setzero_ph();
-
-	for (i = 0; i < count; i += 32)
+	unsigned int mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		sim_512h = _mm512_setzero_ph();
+	__m512h		na_512h = _mm512_setzero_ph();
+	__m512h		nb_512h = _mm512_setzero_ph();
+	__m512h		sim_512h_temp;
+	__m512h		na_512h_temp;
+	__m512h		nb_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h 	bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		sim_512;
+	__m512		na_512;
+	__m512		nb_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
 	{
-		axi = _mm512_loadu_ph(ax+i);
-		bxi = _mm512_loadu_ph(bx+i);
-		sim = _mm512_fmadd_ph(axi, bxi, sim);
-		na = _mm512_fmadd_ph(axi, axi, na);
-		nb = _mm512_fmadd_ph(bxi, bxi, nb);
+		if (dim - i < 32) {
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else {
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h);
+		na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h);
+		nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(sim_512h_temp) ||
+			HasInfinity(na_512h_temp) ||
+			HasInfinity(nb_512h_temp))
+			break;
+		else
+		{
+			sim_512h = sim_512h_temp;
+			na_512h = na_512h_temp;
+			nb_512h = nb_512h_temp;
+		}
 	}
+	sim_512 = ConvertToFp32Sum(sim_512h);
+	na_512 = ConvertToFp32Sum(na_512h);
+	nb_512 = ConvertToFp32Sum(nb_512h);
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-	sim = _mm512_fmadd_ph(axi, bxi, sim);
-	na = _mm512_fmadd_ph(axi, axi, na);
-	nb = _mm512_fmadd_ph(bxi, bxi, nb);
-
-	similarity = (float)_mm512_reduce_add_ph(sim);
-	norma = (float)_mm512_reduce_add_ph(na);
-	normb = (float)_mm512_reduce_add_ph(nb);
-
-	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
-	return (double) similarity / sqrt((double) norma * (double) normb);
-}
-
-TARGET_AVX512FP16 static double
-HalfvecCosineSimilarityAvx512Fp32(int dim, half * ax, half * bx)
-{
-	float		similarity;
-	float		norma;
-	float		normb;
-	int			i;
-	int			count = (dim / 16) * 16;
-	unsigned long mask;
-	__m256h		axi;
-	__m256h 	bxi;
-	__m512		axs;
-	__m512		bxs;
-	__m512		sim = _mm512_setzero_ps();
-	__m512		na = _mm512_setzero_ps();
-	__m512		nb = _mm512_setzero_ps();
-
-	for (i = 0; i < count; i += 16)
+	/* FP32 computation */
+	for (; i < dim; i += 16)
 	{
-		axi = _mm256_loadu_ph(ax+i);
-		bxi = _mm256_loadu_ph(bx+i);
-		axs = _mm512_cvtxph_ps(axi);
-		bxs = _mm512_cvtxph_ps(bxi);
-		sim = _mm512_fmadd_ps(axs, bxs, sim);
-		na = _mm512_fmadd_ps(axs, axs, na);
-		nb = _mm512_fmadd_ps(bxs, bxs, nb);
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512);
+		na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512);
+		nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512);
 	}
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-	axs = _mm512_cvtxph_ps(axi);
-	bxs = _mm512_cvtxph_ps(bxi);
-	sim = _mm512_fmadd_ps(axs, bxs, sim);
-	na = _mm512_fmadd_ps(axs, axs, na);
-	nb = _mm512_fmadd_ps(bxs, bxs, nb);
-
-	similarity = (float)_mm512_reduce_add_ps(sim);
-	norma = (float)_mm512_reduce_add_ps(na);
-	normb = (float)_mm512_reduce_add_ps(nb);
+	similarity = _mm512_reduce_add_ps(sim_512);
+	norma = _mm512_reduce_add_ps(na_512);
+	normb = _mm512_reduce_add_ps(nb_512);
 
 	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
 	return (double) similarity / sqrt((double) norma * (double) normb);
 }
-
-static double
-HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx)
-{
-	if (halfvec_use_fp16_compute)
-		return HalfvecCosineSimilarityAvx512Fp16(dim, ax, bx);
-	else
-		return HalfvecCosineSimilarityAvx512Fp32(dim, ax, bx);
-}
 #endif
 #endif
 
@@ -496,75 +493,72 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx)
 
 #ifdef HAVE_AVX512FP16
 TARGET_AVX512FP16 static float
-HalfvecL1DistanceAvx512Fp16(int dim, half * ax, half * bx)
+HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
 {
 	float		distance;
 	int			i;
-	int			count = (dim / 32) * 32;
 	unsigned long mask;
-	__m512h		axi;
-	__m512h		bxi;
-	__m512h		dist = _mm512_setzero_ph();
 
-	for (i = 0; i < count; i += 32)
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
 	{
-		axi = _mm512_loadu_ph(ax+i);
-		bxi = _mm512_loadu_ph(bx+i);
-		dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi)));
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h)));
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
 	}
+	dist_512 = ConvertToFp32Sum(dist_512h);
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-	dist = _mm512_add_ph(dist, _mm512_abs_ph(_mm512_sub_ph(axi, bxi)));
-
-	distance = (float)_mm512_reduce_add_ph(dist);
-
-	return distance;
-}
-
-TARGET_AVX512FP16 static float
-HalfvecL1DistanceAvx512Fp32(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	int			count = (dim / 16) * 16;
-	unsigned long mask;
-	__m256h		axi;
-	__m256h		bxi;
-	__m512		axs;
-	__m512		bxs;
-	__m512		dist = _mm512_setzero_ps();
-
-	for (i = 0; i < count; i += 16)
+	/* FP32 computation */
+	for (; i < dim; i += 16)
 	{
-		axi = _mm256_loadu_ph(ax+i);
-		bxi = _mm256_loadu_ph(bx+i);
-		axs = _mm512_cvtxph_ps(axi);
-		bxs = _mm512_cvtxph_ps(bxi);
-		dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs)));
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512)));
 	}
 
-	mask = (1 << (dim - i)) - 1;
-	axi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-	bxi = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-	axs = _mm512_cvtxph_ps(axi);
-	bxs = _mm512_cvtxph_ps(bxi);
-	dist = _mm512_add_ps(dist, _mm512_abs_ps(_mm512_sub_ps(axs, bxs)));
-
-	distance = (float)_mm512_reduce_add_ps(dist);
+	distance = _mm512_reduce_add_ps(dist_512);
 
 	return distance;
 }
-
-static float
-HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
-{
-	if (halfvec_use_fp16_compute)
-		return HalfvecL1DistanceAvx512Fp16(dim, ax, bx);
-	else
-		return HalfvecL1DistanceAvx512Fp32(dim, ax, bx);
-}
 #endif
 #endif
 
@@ -618,16 +612,28 @@ SupportsOsXsave()
 	return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE;
 }
 
-#define CPU_FEATURE_AVX512F         (1 << 16)
-#define CPU_FEATURE_AVX512_FP16     (1 << 23)
-#define CPU_FEATURE_AVX512_BW       (1 << 30)
-#define CPU_FEATURE_AVX512VL     	(1 << 31)
+#define CPU_FEATURE_AVX512F     (1 << 16)
+#define CPU_FEATURE_AVX512DQ    (1 << 17)
+#define CPU_FEATURE_AVX512_FP16 (1 << 23)
+#define CPU_FEATURE_AVX512BW    (1 << 30)
+#define CPU_FEATURE_AVX512VL    (1 << 31)
 
 TARGET_XSAVE static bool
 SupportsAvx512Fp16()
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
+	/* AVX512 features required:
+	 *  AVX512F : sub/fma/add instructions
+	 *  AVX512DQ: _mm512_extracti32x8_epi32
+	 *  AVX512VL: _mm256_loadu_ph
+	 *  AVX512BW: masked loads
+	 */
+	unsigned int features = CPU_FEATURE_AVX512F |
+	                        CPU_FEATURE_AVX512DQ |
+	                        CPU_FEATURE_AVX512VL |
+	                        CPU_FEATURE_AVX512BW;
+
 	/* Check OS supports XSAVE */
 	if (!SupportsOsXsave())
 		return false;
@@ -642,16 +648,7 @@ SupportsAvx512Fp16()
 	__cpuid(exx, 7, 0);
 #endif
 
-	/* Required by AVX512 sub/fma/add instructions */
-	if ((exx[1] & CPU_FEATURE_AVX512F) != CPU_FEATURE_AVX512F)
-		return false;
-
-	/* Required by _mm256_loadu_ph */
-    if ((exx[1] & CPU_FEATURE_AVX512VL) != CPU_FEATURE_AVX512VL)
-	    return false;
-
-	/* Required by masked loads in remainder loops */
-	if ((exx[1] & CPU_FEATURE_AVX512_BW) != CPU_FEATURE_AVX512_BW)
+	if ((exx[1] & features) != features)
 		return false;
 
 	return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16;
@@ -691,8 +688,4 @@ HalfvecInit(void)
 	}
 #endif
 #endif
-
-	DefineCustomBoolVariable("halfvec.use_fp16_compute", "Use FP16 computation for distance calculations",
-							"If true, distance calculations are executed in FP16. If false, distance calculations are executed in FP32",
-							&halfvec_use_fp16_compute, false, PGC_USERSET, 0, NULL, NULL, NULL);
 }
diff --git a/src/halfutils.h b/src/halfutils.h
index 99131d583..c684f72d7 100644
--- a/src/halfutils.h
+++ b/src/halfutils.h
@@ -17,9 +17,6 @@ extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx);
 
 void		HalfvecInit(void);
 
-/* Variables */
-extern bool	halfvec_use_fp16_compute;
-
 /*
  * Check if half is NaN
  */
diff --git a/test/expected/halfvec.out b/test/expected/halfvec.out
index 3ef913d6c..c3856c512 100644
--- a/test/expected/halfvec.out
+++ b/test/expected/halfvec.out
@@ -378,6 +378,24 @@ SELECT '[0,0]'::halfvec <-> '[3,4]';
         5
 (1 row)
 
+SELECT l2_distance('[501]'::halfvec, '[1]');
+ l2_distance 
+-------------
+         500
+(1 row)
+
+SELECT l2_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ l2_distance 
+-------------
+           0
+(1 row)
+
+SELECT l2_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ l2_distance 
+-------------
+           6
+(1 row)
+
 SELECT inner_product('[1,2]'::halfvec, '[3,4]');
  inner_product 
 ---------------
@@ -404,6 +422,24 @@ SELECT '[1,2]'::halfvec <#> '[3,4]';
       -11
 (1 row)
 
+SELECT inner_product('[50000,50000]'::halfvec, '[1,1]');
+ inner_product 
+---------------
+         99968
+(1 row)
+
+SELECT inner_product('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ inner_product 
+---------------
+            36
+(1 row)
+
+SELECT inner_product('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ inner_product 
+---------------
+            72
+(1 row)
+
 SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
  cosine_distance 
 -----------------
@@ -466,6 +502,24 @@ SELECT '[1,2]'::halfvec <=> '[2,4]';
         0
 (1 row)
 
+SELECT cosine_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ cosine_distance 
+-----------------
+               0
+(1 row)
+
+SELECT cosine_distance('[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]'::halfvec, '[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]');
+ cosine_distance 
+-----------------
+               1
+(1 row)
+
 SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
  l1_distance 
 -------------
@@ -498,6 +552,18 @@ SELECT '[0,0]'::halfvec <+> '[3,4]';
         7
 (1 row)
 
+SELECT l1_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ l1_distance 
+-------------
+           0
+(1 row)
+
+SELECT l1_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+ l1_distance 
+-------------
+          36
+(1 row)
+
 SELECT l2_normalize('[3,4]'::halfvec);
       l2_normalize      
 ------------------------
diff --git a/test/expected/halfvec_functions_fp16.out b/test/expected/halfvec_functions_fp16.out
deleted file mode 100644
index a82a604b0..000000000
--- a/test/expected/halfvec_functions_fp16.out
+++ /dev/null
@@ -1,141 +0,0 @@
-SET halfvec.use_fp16_compute = true;
-SELECT l2_distance('[0,0]'::halfvec, '[3,4]');
- l2_distance 
--------------
-           5
-(1 row)
-
-SELECT l2_distance('[0,0]'::halfvec, '[0,1]');
- l2_distance 
--------------
-           1
-(1 row)
-
-SELECT l2_distance('[1,2]'::halfvec, '[3]');
-ERROR:  different halfvec dimensions 2 and 1
-SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]');
- l2_distance 
--------------
-           5
-(1 row)
-
-SELECT '[0,0]'::halfvec <-> '[3,4]';
- ?column? 
-----------
-        5
-(1 row)
-
-SELECT inner_product('[1,2]'::halfvec, '[3,4]');
- inner_product 
----------------
-            11
-(1 row)
-
-SELECT inner_product('[1,2]'::halfvec, '[3]');
-ERROR:  different halfvec dimensions 2 and 1
-SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
- inner_product 
----------------
-            45
-(1 row)
-
-SELECT '[1,2]'::halfvec <#> '[3,4]';
- ?column? 
-----------
-      -11
-(1 row)
-
-SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
- cosine_distance 
------------------
-               0
-(1 row)
-
-SELECT cosine_distance('[1,2]'::halfvec, '[0,0]');
- cosine_distance 
------------------
-             NaN
-(1 row)
-
-SELECT cosine_distance('[1,1]'::halfvec, '[1,1]');
- cosine_distance 
------------------
-               0
-(1 row)
-
-SELECT cosine_distance('[1,0]'::halfvec, '[0,2]');
- cosine_distance 
------------------
-               1
-(1 row)
-
-SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]');
- cosine_distance 
------------------
-               2
-(1 row)
-
-SELECT cosine_distance('[1,2]'::halfvec, '[3]');
-ERROR:  different halfvec dimensions 2 and 1
-SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]');
- cosine_distance 
------------------
-               0
-(1 row)
-
-SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]');
- cosine_distance 
------------------
-               2
-(1 row)
-
-SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
- cosine_distance 
------------------
-               0
-(1 row)
-
-SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]');
- cosine_distance 
------------------
-               2
-(1 row)
-
-SELECT '[1,2]'::halfvec <=> '[2,4]';
- ?column? 
-----------
-        0
-(1 row)
-
-SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
- l1_distance 
--------------
-           7
-(1 row)
-
-SELECT l1_distance('[0,0]'::halfvec, '[0,1]');
- l1_distance 
--------------
-           1
-(1 row)
-
-SELECT l1_distance('[1,2]'::halfvec, '[3]');
-ERROR:  different halfvec dimensions 2 and 1
-SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
- l1_distance 
--------------
-           0
-(1 row)
-
-SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]');
- l1_distance 
--------------
-           9
-(1 row)
-
-SELECT '[0,0]'::halfvec <+> '[3,4]';
- ?column? 
-----------
-        7
-(1 row)
-
diff --git a/test/sql/halfvec.sql b/test/sql/halfvec.sql
index 592a6e38a..744d03868 100644
--- a/test/sql/halfvec.sql
+++ b/test/sql/halfvec.sql
@@ -87,12 +87,18 @@ SELECT l2_distance('[0,0]'::halfvec, '[0,1]');
 SELECT l2_distance('[1,2]'::halfvec, '[3]');
 SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]');
 SELECT '[0,0]'::halfvec <-> '[3,4]';
+SELECT l2_distance('[501]'::halfvec, '[1]');
+SELECT l2_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+SELECT l2_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
 
 SELECT inner_product('[1,2]'::halfvec, '[3,4]');
 SELECT inner_product('[1,2]'::halfvec, '[3]');
 SELECT inner_product('[65504]'::halfvec, '[65504]');
 SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
 SELECT '[1,2]'::halfvec <#> '[3,4]';
+SELECT inner_product('[50000,50000]'::halfvec, '[1,1]');
+SELECT inner_product('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+SELECT inner_product('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
 
 SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
 SELECT cosine_distance('[1,2]'::halfvec, '[0,0]');
@@ -105,6 +111,9 @@ SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]');
 SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
 SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]');
 SELECT '[1,2]'::halfvec <=> '[2,4]';
+SELECT cosine_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+SELECT cosine_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+SELECT cosine_distance('[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]'::halfvec, '[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]');
 
 SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
 SELECT l1_distance('[0,0]'::halfvec, '[0,1]');
@@ -112,6 +121,8 @@ SELECT l1_distance('[1,2]'::halfvec, '[3]');
 SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
 SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]');
 SELECT '[0,0]'::halfvec <+> '[3,4]';
+SELECT l1_distance('[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
+SELECT l1_distance('[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]'::halfvec, '[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]');
 
 SELECT l2_normalize('[3,4]'::halfvec);
 SELECT l2_normalize('[3,0]'::halfvec);
diff --git a/test/sql/halfvec_functions_fp16.sql b/test/sql/halfvec_functions_fp16.sql
deleted file mode 100644
index e930a840e..000000000
--- a/test/sql/halfvec_functions_fp16.sql
+++ /dev/null
@@ -1,31 +0,0 @@
-SET halfvec.use_fp16_compute = true;
-SELECT l2_distance('[0,0]'::halfvec, '[3,4]');
-SELECT l2_distance('[0,0]'::halfvec, '[0,1]');
-SELECT l2_distance('[1,2]'::halfvec, '[3]');
-SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]');
-SELECT '[0,0]'::halfvec <-> '[3,4]';
-
-SELECT inner_product('[1,2]'::halfvec, '[3,4]');
-SELECT inner_product('[1,2]'::halfvec, '[3]');
-SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
-SELECT '[1,2]'::halfvec <#> '[3,4]';
-
-SELECT cosine_distance('[1,2]'::halfvec, '[2,4]');
-SELECT cosine_distance('[1,2]'::halfvec, '[0,0]');
-SELECT cosine_distance('[1,1]'::halfvec, '[1,1]');
-SELECT cosine_distance('[1,0]'::halfvec, '[0,2]');
-SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]');
-SELECT cosine_distance('[1,2]'::halfvec, '[3]');
-SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]');
-SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]');
-SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
-SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]');
-SELECT '[1,2]'::halfvec <=> '[2,4]';
-
-SELECT l1_distance('[0,0]'::halfvec, '[3,4]');
-SELECT l1_distance('[0,0]'::halfvec, '[0,1]');
-SELECT l1_distance('[1,2]'::halfvec, '[3]');
-SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]');
-SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]');
-SELECT '[0,0]'::halfvec <+> '[3,4]';
-

From 9e35aa8175f5cd7b56883cb00758177be5e67870 Mon Sep 17 00:00:00 2001
From: Luca Giacchino <luca.giacchino@intel.com>
Date: Mon, 28 Oct 2024 13:50:32 -0700
Subject: [PATCH 3/5] Implement AVX512 vector_to_halfvec conversion

---
 src/halfutils.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/halfutils.h |  3 +++
 src/halfvec.c   |  3 +--
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/src/halfutils.c b/src/halfutils.c
index 4b4cf9d4b..e6048bab1 100644
--- a/src/halfutils.c
+++ b/src/halfutils.c
@@ -32,6 +32,8 @@ float		(*HalfvecInnerProduct) (int dim, half * ax, half * bx);
 double		(*HalfvecCosineSimilarity) (int dim, half * ax, half * bx);
 float		(*HalfvecL1Distance) (int dim, half * ax, half * bx);
 
+void 		(*Float4ToHalfVector) (Vector * vec, HalfVector * result);
+
 static float
 HalfvecL2SquaredDistanceDefault(int dim, half * ax, half * bx)
 {
@@ -562,6 +564,67 @@ HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
 #endif
 #endif
 
+static void
+Float4ToHalfVectorDefault(Vector * vec, HalfVector * result) {
+	for (int i = 0; i < vec->dim; i++)
+		result->x[i] = Float4ToHalf(vec->x[i]);
+}
+
+#ifdef HAVE_AVX512FP16
+TARGET_AVX512FP16 static void
+Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result) {
+	unsigned long mask;
+	__m512		vec_512;
+	__m256h		vec_256h;
+	__mmask16 	vec_512_inf;
+	__mmask16 	vec_256h_inf;
+
+	for (int i = 0; i < vec->dim; i += 16)
+	{
+		if (vec->dim - i < 16)
+		{
+			mask = (1 << (vec->dim - i)) - 1;
+			vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i);
+			vec_256h = _mm512_cvtxps_ph(vec_512);
+			_mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h));
+		}
+		else
+		{
+			vec_512 = _mm512_loadu_ps(vec->x + i);
+			vec_256h = _mm512_cvtxps_ph(vec_512);
+			_mm256_storeu_ph(result->x + i, vec_256h);
+		}
+
+		/* Test for positive and negative infinity */
+		vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10);
+		vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10);
+		if (unlikely(vec_512_inf != vec_256h_inf))
+		{
+			float num;
+			char* buf;
+
+			__mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf);
+			/* Find first element in vector to overflow after conversion (first bit set) */
+			int count = 0;
+			while (diff % 2 == 0) {
+				diff >>= 1;
+				count++;
+			}
+			num = vec->x[i + count];
+
+			/* TODO Avoid duplicate code in Float4ToHalf */
+			buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN);
+
+			float_to_shortest_decimal_buf(num, buf);
+
+			ereport(ERROR,
+					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+					 errmsg("\"%s\" is out of range for type halfvec", buf)));
+		}
+	}
+}
+#endif
+
 #ifdef HALFVEC_DISPATCH
 #define CPU_FEATURE_FMA     (1 << 12)
 #define CPU_FEATURE_OSXSAVE (1 << 27)
@@ -667,6 +730,7 @@ HalfvecInit(void)
 	HalfvecInnerProduct = HalfvecInnerProductDefault;
 	HalfvecCosineSimilarity = HalfvecCosineSimilarityDefault;
 	HalfvecL1Distance = HalfvecL1DistanceDefault;
+	Float4ToHalfVector = Float4ToHalfVectorDefault;
 
 #ifdef HALFVEC_DISPATCH
 	if (SupportsCpuFeature(CPU_FEATURE_AVX | CPU_FEATURE_F16C | CPU_FEATURE_FMA))
@@ -685,6 +749,7 @@ HalfvecInit(void)
 		HalfvecInnerProduct = HalfvecInnerProductAvx512;
 		HalfvecCosineSimilarity = HalfvecCosineSimilarityAvx512;
 		HalfvecL1Distance = HalfvecL1DistanceAvx512;
+		Float4ToHalfVector = Float4ToHalfVectorAvx512;
 	}
 #endif
 #endif
diff --git a/src/halfutils.h b/src/halfutils.h
index c684f72d7..1a33d1748 100644
--- a/src/halfutils.h
+++ b/src/halfutils.h
@@ -5,6 +5,7 @@
 
 #include "common/shortest_dec.h"
 #include "halfvec.h"
+#include "vector.h"
 
 #ifdef F16C_SUPPORT
 #include <immintrin.h>
@@ -15,6 +16,8 @@ extern float (*HalfvecInnerProduct) (int dim, half * ax, half * bx);
 extern double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx);
 extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx);
 
+extern void (*Float4ToHalfVector) (Vector * vec, HalfVector * result);
+
 void		HalfvecInit(void);
 
 /*
diff --git a/src/halfvec.c b/src/halfvec.c
index 6b926e1ee..f9a42a559 100644
--- a/src/halfvec.c
+++ b/src/halfvec.c
@@ -533,8 +533,7 @@ vector_to_halfvec(PG_FUNCTION_ARGS)
 
 	result = InitHalfVector(vec->dim);
 
-	for (int i = 0; i < vec->dim; i++)
-		result->x[i] = Float4ToHalf(vec->x[i]);
+	Float4ToHalfVector(vec, result);
 
 	PG_RETURN_POINTER(result);
 }

From e93f55eeec9c976e4aa826e67a44e21b5e6bb28e Mon Sep 17 00:00:00 2001
From: Luca Giacchino <luca.giacchino@intel.com>
Date: Tue, 26 Nov 2024 19:08:16 -0800
Subject: [PATCH 4/5] Add CI job for AVX512 FP16 optimized path

---
 .github/workflows/build.yml | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 50e65a816..9f541f131 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -41,6 +41,38 @@ jobs:
           sudo apt-get update
           sudo apt-get install libipc-run-perl
       - run: make prove_installcheck
+  ubuntu_spr:
+    runs-on: ubuntu-24.04
+    if: ${{ !startsWith(github.ref_name, 'mac') && !startsWith(github.ref_name, 'windows') }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Intel SDE
+        run: |
+          curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz
+          mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
+          sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+      - name: Install Postgres
+        run: |
+          sudo apt-get install postgresql-16
+          sudo apt-get install postgresql-server-dev-16
+          sudo systemctl stop postgresql
+          pgdir=$(pg_config --bindir)
+          pgdata=/tmp/postgres_data
+          mkdir $pgdata
+          $pgdir/initdb -D $pgdata --username=$USER
+          sudo chmod 777 /var/run/postgresql/
+          sde -spr -mix -- $pgdir/pg_ctl -D $pgdata start
+      - run: make
+        env:
+          PG_CFLAGS: -DUSE_ASSERT_CHECKING -Wall -Wextra -Werror -Wno-unused-parameter -Wno-sign-compare
+      - run: |
+          export PG_CONFIG=`which pg_config`
+          sudo --preserve-env=PG_CONFIG make install
+      - run: sde -spr -mix -- make installcheck
+      - if: ${{ failure() }}
+        run: cat regression.diffs
+      - name: Report AVX512 FP16 FMA instruction count
+        run: cat sde*.txt | grep -E "FMA.*PH"
   mac:
     runs-on: ${{ matrix.os }}
     if: ${{ !startsWith(github.ref_name, 'windows') }}

From f7cf963035c7a27efaa4120484f46a8718c4359f Mon Sep 17 00:00:00 2001
From: Luca Giacchino <luca.giacchino@intel.com>
Date: Tue, 25 Mar 2025 13:41:38 -0700
Subject: [PATCH 5/5] Move AVX-512 functions to separate files

---
 Makefile               |   6 +
 src/halfutils.c        | 440 +---------------------------------------
 src/halfutils_avx512.c | 449 +++++++++++++++++++++++++++++++++++++++++
 src/halfutils_avx512.h |  25 +++
 4 files changed, 482 insertions(+), 438 deletions(-)
 create mode 100644 src/halfutils_avx512.c
 create mode 100644 src/halfutils_avx512.h

diff --git a/Makefile b/Makefile
index d98d73b8c..e89ee1342 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,9 @@ MODULE_big = vector
 DATA = $(wildcard sql/*--*--*.sql)
 DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql
 OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o
+ifneq ($(USE_AVX512), 0)
+	OBJS += src/halfutils_avx512.o
+endif
 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
 
 TESTS = $(wildcard test/sql/*.sql)
@@ -31,6 +34,9 @@ endif
 # - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
 # - Clang (could use pragma instead) - https://llvm.org/docs/Vectorizers.html
 PG_CFLAGS += $(OPTFLAGS) -ftree-vectorize -fassociative-math -fno-signed-zeros -fno-trapping-math
+ifneq ($(USE_AVX512), 0)
+	PG_CFLAGS += -DUSE_AVX512
+endif
 
 # Debug GCC auto-vectorization
 # PG_CFLAGS += -fopt-info-vec
diff --git a/src/halfutils.c b/src/halfutils.c
index e6048bab1..bdc370a19 100644
--- a/src/halfutils.c
+++ b/src/halfutils.c
@@ -5,6 +5,7 @@
 
 #ifdef HALFVEC_DISPATCH
 #include <immintrin.h>
+#include "halfutils_avx512.h"
 
 #if defined(USE__GET_CPUID)
 #include <cpuid.h>
@@ -12,18 +13,10 @@
 #include <intrin.h>
 #endif
 
-#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \
-	(defined(__clang__) && (__clang_major__ >= 16)) || \
-	(defined __AVX512FP16__)
-#define HAVE_AVX512FP16
-#endif
-
 #ifdef _MSC_VER
 #define TARGET_F16C
-#define TARGET_AVX512FP16
 #else
 #define TARGET_F16C __attribute__((target("avx,f16c,fma")))
-#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw")))
 #endif
 #endif
 
@@ -84,93 +77,6 @@ HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
-
-#ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static inline bool
-HasInfinity(__m512h val) {
-	/* Test for positive and negative infinity */
-	__mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10);
-	return mask != 0;
-}
-
-TARGET_AVX512FP16 static inline __m512
-ConvertToFp32Sum(__m512h val) {
-	__m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0));
-	__m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1));
-	return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper));
-}
-
-TARGET_AVX512FP16 static float
-HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	unsigned long mask;
-
-	/* For FP16 computation */
-	__m512h		axi_512h;
-	__m512h		bxi_512h;
-	__m512h		diff_512h;
-	__m512h		dist_512h = _mm512_setzero_ph();
-	__m512h		dist_512h_temp;
-
-	/* For FP32 computation */
-	__m256h		axi_256h;
-	__m256h		bxi_256h;
-	__m512		axi_512;
-	__m512		bxi_512;
-	__m512		diff_512;
-	__m512		dist_512;
-
-	/* FP16 computation */
-	for (i = 0; i < dim; i += 32)
-	{
-		if (dim - i < 32)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_512h = _mm512_loadu_ph(ax + i);
-			bxi_512h = _mm512_loadu_ph(bx + i);
-		}
-		diff_512h = _mm512_sub_ph(axi_512h, bxi_512h);
-		dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h);
-
-		/* if overflow, continue with FP32 */
-		if (HasInfinity(dist_512h_temp))
-			break;
-		else
-			dist_512h = dist_512h_temp;
-	}
-	dist_512 = ConvertToFp32Sum(dist_512h);
-
-	/* FP32 computation */
-	for (; i < dim; i += 16)
-	{
-		if (dim - i < 16)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_256h = _mm256_loadu_ph(ax + i);
-			bxi_256h = _mm256_loadu_ph(bx + i);
-		}
-		axi_512 = _mm512_cvtxph_ps(axi_256h);
-		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
-		diff_512 = _mm512_sub_ps(axi_512, bxi_512);
-		dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512);
-	}
-
-	distance = _mm512_reduce_add_ps(dist_512);
-	return distance;
-}
-#endif
 #endif
 
 static float
@@ -214,75 +120,6 @@ HalfvecInnerProductF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
-
-#ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static float
-HalfvecInnerProductAvx512(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	unsigned int mask;
-
-	/* For FP16 computation */
-	__m512h		axi_512h;
-	__m512h		bxi_512h;
-	__m512h		dist_512h = _mm512_setzero_ph();
-	__m512h		dist_512h_temp;
-
-	/* For FP32 computation */
-	__m256h		axi_256h;
-	__m256h		bxi_256h;
-	__m512		axi_512;
-	__m512		bxi_512;
-	__m512		dist_512;
-
-	/* FP16 computation */
-	for (i = 0; i < dim; i += 32)
-	{
-		if (dim - i < 32)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_512h = _mm512_loadu_ph(ax + i);
-			bxi_512h = _mm512_loadu_ph(bx + i);
-		}
-		dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h);
-
-		/* if overflow, continue with FP32 */
-		if (HasInfinity(dist_512h_temp))
-			break;
-		else
-			dist_512h = dist_512h_temp;
-	}
-	dist_512 = ConvertToFp32Sum(dist_512h);
-
-	/* FP32 computation */
-	for (; i < dim; i += 16)
-	{
-		if (dim - i < 16)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_256h = _mm256_loadu_ph(ax + i);
-			bxi_256h = _mm256_loadu_ph(bx + i);
-		}
-		axi_512 = _mm512_cvtxph_ps(axi_256h);
-		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
-		dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512);
-	}
-
-	distance = _mm512_reduce_add_ps(dist_512);
-	return distance;
-}
-#endif
 #endif
 
 static double
@@ -356,97 +193,6 @@ HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx)
 	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
 	return (double) similarity / sqrt((double) norma * (double) normb);
 }
-
-#ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static double
-HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx)
-{
-	float		similarity;
-	float		norma;
-	float		normb;
-	int			i;
-	unsigned int mask;
-
-	/* For FP16 computation */
-	__m512h		axi_512h;
-	__m512h		bxi_512h;
-	__m512h		sim_512h = _mm512_setzero_ph();
-	__m512h		na_512h = _mm512_setzero_ph();
-	__m512h		nb_512h = _mm512_setzero_ph();
-	__m512h		sim_512h_temp;
-	__m512h		na_512h_temp;
-	__m512h		nb_512h_temp;
-
-	/* For FP32 computation */
-	__m256h		axi_256h;
-	__m256h 	bxi_256h;
-	__m512		axi_512;
-	__m512		bxi_512;
-	__m512		sim_512;
-	__m512		na_512;
-	__m512		nb_512;
-
-	/* FP16 computation */
-	for (i = 0; i < dim; i += 32)
-	{
-		if (dim - i < 32) {
-			mask = (1 << (dim - i)) - 1;
-			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-		}
-		else {
-			axi_512h = _mm512_loadu_ph(ax + i);
-			bxi_512h = _mm512_loadu_ph(bx + i);
-		}
-		sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h);
-		na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h);
-		nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h);
-
-		/* if overflow, continue with FP32 */
-		if (HasInfinity(sim_512h_temp) ||
-			HasInfinity(na_512h_temp) ||
-			HasInfinity(nb_512h_temp))
-			break;
-		else
-		{
-			sim_512h = sim_512h_temp;
-			na_512h = na_512h_temp;
-			nb_512h = nb_512h_temp;
-		}
-	}
-	sim_512 = ConvertToFp32Sum(sim_512h);
-	na_512 = ConvertToFp32Sum(na_512h);
-	nb_512 = ConvertToFp32Sum(nb_512h);
-
-	/* FP32 computation */
-	for (; i < dim; i += 16)
-	{
-		if (dim - i < 16)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_256h = _mm256_loadu_ph(ax + i);
-			bxi_256h = _mm256_loadu_ph(bx + i);
-		}
-		axi_512 = _mm512_cvtxph_ps(axi_256h);
-		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
-		sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512);
-		na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512);
-		nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512);
-	}
-
-	similarity = _mm512_reduce_add_ps(sim_512);
-	norma = _mm512_reduce_add_ps(na_512);
-	normb = _mm512_reduce_add_ps(nb_512);
-
-	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
-	return (double) similarity / sqrt((double) norma * (double) normb);
-}
-#endif
 #endif
 
 static float
@@ -492,76 +238,6 @@ HalfvecL1DistanceF16c(int dim, half * ax, half * bx)
 
 	return distance;
 }
-
-#ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static float
-HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
-{
-	float		distance;
-	int			i;
-	unsigned long mask;
-
-	/* For FP16 computation */
-	__m512h		axi_512h;
-	__m512h		bxi_512h;
-	__m512h		dist_512h = _mm512_setzero_ph();
-	__m512h		dist_512h_temp;
-
-	/* For FP32 computation */
-	__m256h		axi_256h;
-	__m256h		bxi_256h;
-	__m512		axi_512;
-	__m512		bxi_512;
-	__m512		dist_512;
-
-	/* FP16 computation */
-	for (i = 0; i < dim; i += 32)
-	{
-		if (dim - i < 32)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
-			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_512h = _mm512_loadu_ph(ax + i);
-			bxi_512h = _mm512_loadu_ph(bx + i);
-		}
-		dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h)));
-
-		/* if overflow, continue with FP32 */
-		if (HasInfinity(dist_512h_temp))
-			break;
-		else
-			dist_512h = dist_512h_temp;
-	}
-	dist_512 = ConvertToFp32Sum(dist_512h);
-
-	/* FP32 computation */
-	for (; i < dim; i += 16)
-	{
-		if (dim - i < 16)
-		{
-			mask = (1 << (dim - i)) - 1;
-			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
-			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
-		}
-		else
-		{
-			axi_256h = _mm256_loadu_ph(ax + i);
-			bxi_256h = _mm256_loadu_ph(bx + i);
-		}
-		axi_512 = _mm512_cvtxph_ps(axi_256h);
-		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
-		dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512)));
-	}
-
-	distance = _mm512_reduce_add_ps(dist_512);
-
-	return distance;
-}
-#endif
 #endif
 
 static void
@@ -570,60 +246,6 @@ Float4ToHalfVectorDefault(Vector * vec, HalfVector * result) {
 		result->x[i] = Float4ToHalf(vec->x[i]);
 }
 
-#ifdef HAVE_AVX512FP16
-TARGET_AVX512FP16 static void
-Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result) {
-	unsigned long mask;
-	__m512		vec_512;
-	__m256h		vec_256h;
-	__mmask16 	vec_512_inf;
-	__mmask16 	vec_256h_inf;
-
-	for (int i = 0; i < vec->dim; i += 16)
-	{
-		if (vec->dim - i < 16)
-		{
-			mask = (1 << (vec->dim - i)) - 1;
-			vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i);
-			vec_256h = _mm512_cvtxps_ph(vec_512);
-			_mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h));
-		}
-		else
-		{
-			vec_512 = _mm512_loadu_ps(vec->x + i);
-			vec_256h = _mm512_cvtxps_ph(vec_512);
-			_mm256_storeu_ph(result->x + i, vec_256h);
-		}
-
-		/* Test for positive and negative infinity */
-		vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10);
-		vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10);
-		if (unlikely(vec_512_inf != vec_256h_inf))
-		{
-			float num;
-			char* buf;
-
-			__mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf);
-			/* Find first element in vector to overflow after conversion (first bit set) */
-			int count = 0;
-			while (diff % 2 == 0) {
-				diff >>= 1;
-				count++;
-			}
-			num = vec->x[i + count];
-
-			/* TODO Avoid duplicate code in Float4ToHalf */
-			buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN);
-
-			float_to_shortest_decimal_buf(num, buf);
-
-			ereport(ERROR,
-					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-					 errmsg("\"%s\" is out of range for type halfvec", buf)));
-		}
-	}
-}
-#endif
 
 #ifdef HALFVEC_DISPATCH
 #define CPU_FEATURE_FMA     (1 << 12)
@@ -659,64 +281,6 @@ SupportsCpuFeature(unsigned int feature)
 	/* Now check features */
 	return (exx[2] & feature) == feature;
 }
-
-#ifdef HAVE_AVX512FP16
-TARGET_XSAVE static bool
-SupportsOsXsave()
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#else
-	__cpuid(exx, 1);
-#endif
-
-	return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE;
-}
-
-#define CPU_FEATURE_AVX512F     (1 << 16)
-#define CPU_FEATURE_AVX512DQ    (1 << 17)
-#define CPU_FEATURE_AVX512_FP16 (1 << 23)
-#define CPU_FEATURE_AVX512BW    (1 << 30)
-#define CPU_FEATURE_AVX512VL    (1 << 31)
-
-TARGET_XSAVE static bool
-SupportsAvx512Fp16()
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-	/* AVX512 features required:
-	 *  AVX512F : sub/fma/add instructions
-	 *  AVX512DQ: _mm512_extracti32x8_epi32
-	 *  AVX512VL: _mm256_loadu_ph
-	 *  AVX512BW: masked loads
-	 */
-	unsigned int features = CPU_FEATURE_AVX512F |
-	                        CPU_FEATURE_AVX512DQ |
-	                        CPU_FEATURE_AVX512VL |
-	                        CPU_FEATURE_AVX512BW;
-
-	/* Check OS supports XSAVE */
-	if (!SupportsOsXsave())
-		return false;
-
-	/* Check XMM, YMM, and ZMM registers are enabled */
-	if ((_xgetbv(0) & 0xe6) != 0xe6)
-		return false;
-		
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 7, 0);
-#endif
-
-	if ((exx[1] & features) != features)
-		return false;
-
-	return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16;
-}
-#endif
 #endif
 
 void
@@ -742,7 +306,7 @@ HalfvecInit(void)
 		HalfvecL1Distance = HalfvecL1DistanceF16c;
 	}
 
-#ifdef HAVE_AVX512FP16
+#if defined(USE_AVX512) && defined(HAVE_AVX512FP16)
 	if (SupportsAvx512Fp16())
 	{
 		HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceAvx512;
diff --git a/src/halfutils_avx512.c b/src/halfutils_avx512.c
new file mode 100644
index 000000000..27d907781
--- /dev/null
+++ b/src/halfutils_avx512.c
@@ -0,0 +1,449 @@
+#ifdef USE_AVX512
+#include "halfutils_avx512.h"
+
+#ifdef HAVE_AVX512FP16
+#include "common/shortest_dec.h"
+
+#include <immintrin.h>
+#include <math.h>
+
+#if defined(USE__GET_CPUID)
+#include <cpuid.h>
+#else
+#include <intrin.h>
+#endif
+
+#ifdef _MSC_VER
+#define TARGET_AVX512FP16
+#else
+#define TARGET_AVX512FP16 __attribute__((target("avx512fp16,avx512f,avx512dq,avx512vl,avx512bw")))
+#endif
+
+TARGET_AVX512FP16 static inline bool
+HasInfinity(__m512h val) {
+	/* Test for positive and negative infinity */
+	__mmask32 mask = _mm512_fpclass_ph_mask(val, 0x08 + 0x10);
+	return mask != 0;
+}
+
+TARGET_AVX512FP16 static inline __m512
+ConvertToFp32Sum(__m512h val) {
+	__m256h val_lower = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 0));
+	__m256h val_upper = _mm256_castsi256_ph(_mm512_extracti32x8_epi32(_mm512_castph_si512(val), 1));
+	return _mm512_add_ps(_mm512_cvtxph_ps(val_lower), _mm512_cvtxph_ps(val_upper));
+}
+
+TARGET_AVX512FP16 float
+HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	unsigned long mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		diff_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		diff_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
+	{
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		diff_512h = _mm512_sub_ph(axi_512h, bxi_512h);
+		dist_512h_temp = _mm512_fmadd_ph(diff_512h, diff_512h, dist_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
+	}
+	dist_512 = ConvertToFp32Sum(dist_512h);
+
+	/* FP32 computation */
+	for (; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		diff_512 = _mm512_sub_ps(axi_512, bxi_512);
+		dist_512 = _mm512_fmadd_ps(diff_512, diff_512, dist_512);
+	}
+
+	distance = _mm512_reduce_add_ps(dist_512);
+	return distance;
+}
+
+TARGET_AVX512FP16 float
+HalfvecInnerProductAvx512(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	unsigned int mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
+	{
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		dist_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, dist_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
+	}
+	dist_512 = ConvertToFp32Sum(dist_512h);
+
+	/* FP32 computation */
+	for (; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		dist_512 = _mm512_fmadd_ps(axi_512, bxi_512, dist_512);
+	}
+
+	distance = _mm512_reduce_add_ps(dist_512);
+	return distance;
+}
+
+TARGET_AVX512FP16 double
+HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx)
+{
+	float		similarity;
+	float		norma;
+	float		normb;
+	int			i;
+	unsigned int mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		sim_512h = _mm512_setzero_ph();
+	__m512h		na_512h = _mm512_setzero_ph();
+	__m512h		nb_512h = _mm512_setzero_ph();
+	__m512h		sim_512h_temp;
+	__m512h		na_512h_temp;
+	__m512h		nb_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h 	bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		sim_512;
+	__m512		na_512;
+	__m512		nb_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
+	{
+		if (dim - i < 32) {
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else {
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		sim_512h_temp = _mm512_fmadd_ph(axi_512h, bxi_512h, sim_512h);
+		na_512h_temp = _mm512_fmadd_ph(axi_512h, axi_512h, na_512h);
+		nb_512h_temp = _mm512_fmadd_ph(bxi_512h, bxi_512h, nb_512h);
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(sim_512h_temp) ||
+			HasInfinity(na_512h_temp) ||
+			HasInfinity(nb_512h_temp))
+			break;
+		else
+		{
+			sim_512h = sim_512h_temp;
+			na_512h = na_512h_temp;
+			nb_512h = nb_512h_temp;
+		}
+	}
+	sim_512 = ConvertToFp32Sum(sim_512h);
+	na_512 = ConvertToFp32Sum(na_512h);
+	nb_512 = ConvertToFp32Sum(nb_512h);
+
+	/* FP32 computation */
+	for (; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		sim_512 = _mm512_fmadd_ps(axi_512, bxi_512, sim_512);
+		na_512 = _mm512_fmadd_ps(axi_512, axi_512, na_512);
+		nb_512 = _mm512_fmadd_ps(bxi_512, bxi_512, nb_512);
+	}
+
+	similarity = _mm512_reduce_add_ps(sim_512);
+	norma = _mm512_reduce_add_ps(na_512);
+	normb = _mm512_reduce_add_ps(nb_512);
+
+	/* Use sqrt(a * b) over sqrt(a) * sqrt(b) */
+	return (double) similarity / sqrt((double) norma * (double) normb);
+}
+
+TARGET_AVX512FP16 float
+HalfvecL1DistanceAvx512(int dim, half * ax, half * bx)
+{
+	float		distance;
+	int			i;
+	unsigned long mask;
+
+	/* For FP16 computation */
+	__m512h		axi_512h;
+	__m512h		bxi_512h;
+	__m512h		dist_512h = _mm512_setzero_ph();
+	__m512h		dist_512h_temp;
+
+	/* For FP32 computation */
+	__m256h		axi_256h;
+	__m256h		bxi_256h;
+	__m512		axi_512;
+	__m512		bxi_512;
+	__m512		dist_512;
+
+	/* FP16 computation */
+	for (i = 0; i < dim; i += 32)
+	{
+		if (dim - i < 32)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, ax + i));
+			bxi_512h = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_512h = _mm512_loadu_ph(ax + i);
+			bxi_512h = _mm512_loadu_ph(bx + i);
+		}
+		dist_512h_temp = _mm512_add_ph(dist_512h, _mm512_abs_ph(_mm512_sub_ph(axi_512h, bxi_512h)));
+
+		/* if overflow, continue with FP32 */
+		if (HasInfinity(dist_512h_temp))
+			break;
+		else
+			dist_512h = dist_512h_temp;
+	}
+	dist_512 = ConvertToFp32Sum(dist_512h);
+
+	/* FP32 computation */
+	for (; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, ax + i));
+			bxi_256h = _mm256_castsi256_ph(_mm256_maskz_loadu_epi16(mask, bx + i));
+		}
+		else
+		{
+			axi_256h = _mm256_loadu_ph(ax + i);
+			bxi_256h = _mm256_loadu_ph(bx + i);
+		}
+		axi_512 = _mm512_cvtxph_ps(axi_256h);
+		bxi_512 = _mm512_cvtxph_ps(bxi_256h);
+		dist_512 = _mm512_add_ps(dist_512, _mm512_abs_ps(_mm512_sub_ps(axi_512, bxi_512)));
+	}
+
+	distance = _mm512_reduce_add_ps(dist_512);
+
+	return distance;
+}
+
+TARGET_AVX512FP16 void
+Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result)
+{
+	unsigned long mask;
+	__m512		vec_512;
+	__m256h		vec_256h;
+	__mmask16 	vec_512_inf;
+	__mmask16 	vec_256h_inf;
+
+	for (int i = 0; i < vec->dim; i += 16)
+	{
+		if (vec->dim - i < 16)
+		{
+			mask = (1 << (vec->dim - i)) - 1;
+			vec_512 = _mm512_maskz_loadu_ps(mask, vec->x + i);
+			vec_256h = _mm512_cvtxps_ph(vec_512);
+			_mm256_mask_storeu_epi16(result->x + i, mask, _mm256_castph_si256(vec_256h));
+		}
+		else
+		{
+			vec_512 = _mm512_loadu_ps(vec->x + i);
+			vec_256h = _mm512_cvtxps_ph(vec_512);
+			_mm256_storeu_ph(result->x + i, vec_256h);
+		}
+
+		/* Test for positive and negative infinity */
+		vec_512_inf = _mm512_fpclass_ps_mask(vec_512, 0x08 + 0x10);
+		vec_256h_inf = _mm256_fpclass_ph_mask(vec_256h, 0x08 + 0x10);
+		if (unlikely(vec_512_inf != vec_256h_inf))
+		{
+			float num;
+			char* buf;
+
+			__mmask16 diff = _kxor_mask16(vec_512_inf, vec_256h_inf);
+			/* Find first element in vector to overflow after conversion (first bit set) */
+			int count = 0;
+			while (diff % 2 == 0) {
+				diff >>= 1;
+				count++;
+			}
+			num = vec->x[i + count];
+
+			/* TODO Avoid duplicate code in Float4ToHalf */
+			buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN);
+
+			float_to_shortest_decimal_buf(num, buf);
+
+			ereport(ERROR,
+					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+					 errmsg("\"%s\" is out of range for type halfvec", buf)));
+		}
+	}
+}
+
+#define CPU_FEATURE_OSXSAVE     (1 << 27)
+#define CPU_FEATURE_AVX512F     (1 << 16)
+#define CPU_FEATURE_AVX512DQ    (1 << 17)
+#define CPU_FEATURE_AVX512_FP16 (1 << 23)
+#define CPU_FEATURE_AVX512BW    (1 << 30)
+#define CPU_FEATURE_AVX512VL    (1 << 31)
+
+#ifdef _MSC_VER
+#define TARGET_XSAVE
+#else
+#define TARGET_XSAVE __attribute__((target("xsave")))
+#endif
+
+TARGET_XSAVE static bool
+SupportsOsXsave()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#else
+	__cpuid(exx, 1);
+#endif
+
+	return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE;
+}
+
+TARGET_XSAVE bool
+SupportsAvx512Fp16()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* AVX512 features required:
+	 *  AVX512F : sub/fma/add instructions
+	 *  AVX512DQ: _mm512_extracti32x8_epi32
+	 *  AVX512VL: _mm256_loadu_ph
+	 *  AVX512BW: masked loads
+	 */
+	unsigned int features = CPU_FEATURE_AVX512F |
+	                        CPU_FEATURE_AVX512DQ |
+	                        CPU_FEATURE_AVX512VL |
+	                        CPU_FEATURE_AVX512BW;
+
+	/* Check OS supports XSAVE */
+	if (!SupportsOsXsave())
+		return false;
+
+	/* Check XMM, YMM, and ZMM registers are enabled */
+	if ((_xgetbv(0) & 0xe6) != 0xe6)
+		return false;
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 7, 0);
+#endif
+
+	if ((exx[1] & features) != features)
+		return false;
+
+	return (exx[3] & CPU_FEATURE_AVX512_FP16) == CPU_FEATURE_AVX512_FP16;
+}
+
+#endif
+#endif
diff --git a/src/halfutils_avx512.h b/src/halfutils_avx512.h
new file mode 100644
index 000000000..9f94f11dc
--- /dev/null
+++ b/src/halfutils_avx512.h
@@ -0,0 +1,25 @@
+#ifndef HALFUTILS_AVX512_H
+#define HALFUTILS_AVX512_H
+
+#ifdef USE_AVX512
+#include "postgres.h"
+#include "halfvec.h"
+#include "vector.h"
+
+#if (defined(__GNUC__) && (__GNUC__ >= 12)) || \
+	(defined(__clang__) && (__clang_major__ >= 16)) || \
+	(defined __AVX512FP16__)
+#define HAVE_AVX512FP16
+#endif
+
+#ifdef HAVE_AVX512FP16
+extern float HalfvecL2SquaredDistanceAvx512(int dim, half * ax, half * bx);
+extern float HalfvecInnerProductAvx512(int dim, half * ax, half * bx);
+extern double HalfvecCosineSimilarityAvx512(int dim, half * ax, half * bx);
+extern float HalfvecL1DistanceAvx512(int dim, half * ax, half * bx);
+extern void Float4ToHalfVectorAvx512(Vector * vec, HalfVector * result);
+
+extern bool SupportsAvx512Fp16(void);
+#endif
+#endif
+#endif