klmckeig · klmckeig · Aug 13, 2025 · Apr 11, 2025 · Oct 21, 2025 · Oct 23, 2025
diff --git a/Makefile b/Makefile
@@ -4,7 +4,7 @@ EXTVERSION = 0.8.1
 MODULE_big = vector
 DATA = $(wildcard sql/*--*--*.sql)
 DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql
-OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o
+OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vectorutils.o src/vector.o
 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
 
 TESTS = $(wildcard test/sql/*.sql)

diff --git a/src/vector.c b/src/vector.c
@@ -21,6 +21,7 @@
 #include "utils/lsyscache.h"
 #include "utils/numeric.h"
 #include "vector.h"
+#include "vectorutils.h"
 
 #if PG_VERSION_NUM >= 160000
 #include "varatt.h"
@@ -48,6 +49,7 @@ PGDLLEXPORT void _PG_init(void);
 void
 _PG_init(void)
 {
+	VectorInit();
 	BitvecInit();
 	HalfvecInit();
 	HnswInit();
@@ -946,22 +948,8 @@ binary_quantize(PG_FUNCTION_ARGS)
 	float	   *ax = a->x;
 	VarBit	   *result = InitBitVector(a->dim);
 	unsigned char *rx = VARBITS(result);
-	int			i = 0;
-	int			count = (a->dim / 8) * 8;
 
-	/* Auto-vectorized */
-	for (; i < count; i += 8)
-	{
-		unsigned char result_byte = 0;
-
-		for (int j = 0; j < 8; j++)
-			result_byte |= (ax[i + j] > 0) << (7 - j);
-
-		rx[i / 8] = result_byte;
-	}
-
-	for (; i < a->dim; i++)
-		rx[i / 8] |= (ax[i] > 0) << (7 - (i % 8));
+	BinaryQuantize(a->dim, ax, rx);
 
 	PG_RETURN_VARBIT_P(result);
 }

diff --git a/src/vectorutils.c b/src/vectorutils.c
@@ -0,0 +1,201 @@
+#include "postgres.h"
+
+#include "vectorutils.h"
+#include "halfvec.h"			/* for USE_DISPATCH and USE_TARGET_CLONES */
+
+#if defined(USE_DISPATCH)
+#define VECTOR_DISPATCH
+#endif
+
+#ifdef VECTOR_DISPATCH
+#include <immintrin.h>
+
+#if defined(USE__GET_CPUID)
+#include <cpuid.h>
+#else
+#include <intrin.h>
+#endif
+
+#ifdef _MSC_VER
+#define TARGET_AVX512
+#define TARGET_AVX512_GFNI
+#else
+#define TARGET_AVX512 __attribute__((target("avx512f")))
+#define TARGET_AVX512_GFNI __attribute__((target("avx512f,avx512vl,gfni")))
+#endif
+#endif
+
+void		(*BinaryQuantize) (int dim, float *ax, unsigned char *rx);
+
+static void
+BinaryQuantizeDefault(int dim, float *ax, unsigned char *rx) {
+	int			i;
+	int			count = (dim / 8) * 8;
+	unsigned char result_byte;
+
+	for (i = 0; i < count; i += 8)
+	{
+		result_byte = 0;
+		for (int j = 0; j < 8; j++)
+			result_byte |= (ax[i + j] > 0) << (7 - j);
+		rx[i / 8] = result_byte;
+	}
+	for (; i < dim; i++)
+		rx[i / 8] |= (ax[i] > 0) << (7 - (i % 8));
+}
+
+#ifdef VECTOR_DISPATCH
+TARGET_AVX512 static inline void
+BinaryQuantizeAvx512Compare(int dim, float *ax, unsigned char *rx) {
+	int				rx_bytes = 0;
+	unsigned long	mask;
+	__m512			axi_512;
+	__m512			zero_512 = _mm512_setzero_ps();
+	__mmask16		cmp;
+
+	for (int i = 0; i < dim; i += 16)
+	{
+		if (dim - i < 16)
+		{
+			mask = (1 << (dim - i)) - 1;
+			axi_512 = _mm512_maskz_loadu_ps(mask, ax + i);
+			cmp = _mm512_cmp_ps_mask(axi_512, zero_512, _CMP_GT_OQ);
+			if (dim - i > 8)
+				*((uint16_t*)(rx + rx_bytes)) = cmp;
+			else {
+				*((uint8_t*)(rx + rx_bytes)) = (uint8_t)(cmp & 0xFF);
+			}
+		}
+		else
+		{
+			axi_512 = _mm512_loadu_ps(ax + i);
+			cmp = _mm512_cmp_ps_mask(axi_512, zero_512, _CMP_GT_OQ);
+			*((uint16_t*)(rx + rx_bytes)) = cmp;
+			rx_bytes += 2;
+		}
+	}
+}
+
+static const uint8_t bit_invert_lookup[16] = {
+	0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
+	0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
+};
+
+TARGET_AVX512 static void
+BinaryQuantizeAvx512(int dim, float *ax, unsigned char *rx) {
+	int	rx_bytes = 0;
+
+	BinaryQuantizeAvx512Compare(dim, ax, rx);
+
+	rx_bytes = dim / 8;
+	if (dim % 8 > 0)
+		rx_bytes++;
+	for (int i = 0; i < rx_bytes; i++)
+		rx[i] = (bit_invert_lookup[rx[i] & 0b1111] << 4) | bit_invert_lookup[rx[i] >> 4];
+}
+
+/* For GFNI instructions to invert bit order refer to
+ * Galois Field New Instructions (GFNI) Technology Guide
+ * https://builders.intel.com/docs/networkbuilders/galois-field-new-instructions-gfni-technology-guide-1-1639042826.pdf
+ */
+#define GFNI_REVBIT		0x8040201008040201
+
+TARGET_AVX512_GFNI static void
+BinaryQuantizeAvx512Gfni(int dim, float *ax, unsigned char *rx) {
+	int			rx_bytes = 0;
+	__m128i		revbit = _mm_set1_epi64x(GFNI_REVBIT);
+	__m128i		rxi;
+	__m128i		rxirev;
+	int			count;
+	int			i = 0;
+
+	BinaryQuantizeAvx512Compare(dim, ax, rx);
+
+	rx_bytes = dim / 8;
+	if (dim % 8 > 0)
+		rx_bytes++;
+	count = (rx_bytes/16)*16;
+	for (; i < count; i += 16)
+	{
+		rxi = _mm_loadu_epi64(rx + i);
+		rxirev = _mm_gf2p8affine_epi64_epi8(rxi, revbit, 0);
+		_mm_storeu_epi64(rx + i, rxirev);
+	}
+	for (; i < rx_bytes; i++)
+		rx[i] =(bit_invert_lookup[rx[i] & 0b1111] << 4) | bit_invert_lookup[rx[i] >> 4];
+}
+
+#define CPU_FEATURE_OSXSAVE  (1 << 27)
+#define CPU_FEATURE_AVX512F  (1 << 16)
+#define CPU_FEATURE_AVX512VL (1 << 31)
+#define CPU_FEATURE_GFNI     (1 << 8)
+
+#ifdef _MSC_VER
+#define TARGET_XSAVE
+#else
+#define TARGET_XSAVE __attribute__((target("xsave")))
+#endif
+
+TARGET_XSAVE static bool
+SupportsOsXsave()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#else
+	__cpuid(exx, 1);
+#endif
+
+	return (exx[2] & CPU_FEATURE_OSXSAVE) == CPU_FEATURE_OSXSAVE;
+}
+
+TARGET_XSAVE static bool
+SupportsAvx512(unsigned int feature)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* Check OS supports XSAVE */
+	if (!SupportsOsXsave())
+		return false;
+
+	/* Check XMM, YMM, and ZMM registers are enabled */
+	if ((_xgetbv(0) & 0xe6) != 0xe6)
+		return false;
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 7, 0);
+#endif
+
+	return (exx[1] & feature) == feature;
+}
+
+TARGET_XSAVE static bool
+SupportsGfni()
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 7, 0);
+#endif
+
+	return (exx[2] & CPU_FEATURE_GFNI) == CPU_FEATURE_GFNI;
+}
+#endif
+
+void
+VectorInit(void)
+{
+	BinaryQuantize = BinaryQuantizeDefault;
+
+#ifdef VECTOR_DISPATCH
+	if (SupportsAvx512(CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512VL) && SupportsGfni())
+		BinaryQuantize = BinaryQuantizeAvx512Gfni;
+	else if (SupportsAvx512(CPU_FEATURE_AVX512F))
+		BinaryQuantize = BinaryQuantizeAvx512;
+#endif
+}
diff --git a/src/vectorutils.h b/src/vectorutils.h
@@ -0,0 +1,8 @@
+#ifndef VECTORUTILS_H
+#define VECTORUTILS_H
+
+extern void (*BinaryQuantize) (int dim, float *ax, unsigned char *rx);
+
+void		VectorInit(void);
+
+#endif