Thank you for libdeflate.
This is not a bug report for libdeflate. This is a bug experienced while using libdeflate.
I have 2 CPUs (25 and 89 on a AMD EPYC 9384X) behaving incorrectly when running _mm512_dpbusd_epi32, just on one host.
From:
|
v_s2 = VDPBUSD(v_s2, data, mults); |
This resulted in libdeflate failing to decompress with
LIBDEFLATE_BAD_DATA.
I am wondering if anyone else has seen similar problems?
Maybe this issue could help anyone else seeing similar problems.
A workaround was to disable using the intrinsic instruction(s):
cmake -DCMAKE_C_FLAGS="-DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI=1" ...
Reproducer using libdeflate:
> cat libdeflate_512_dpbusd.c
#include <libdeflate.h>
#include <stdio.h>
#include <stdlib.h> // malloc
#define _GNU_SOURCE
#include <sched.h> // sched_getcpu
int sched_getcpu();
// gcc -g -O0 libdeflate_512_dpbusd.c -I$HOME/libdeflate/libdeflate-1.25/ -L$HOME/libdeflate/libdeflate-1.25/build -ldeflate -o libdeflate_512_dpbusd
// LD_LIBRARY_PATH=$HOME/libdeflate/libdeflate-1.25/build ./libdeflate_512_dpbusd
// Seen on one AMD EPYC 9384X 32-Core Processor host, model 17, cpu family 25, microcode 0xa101253, gcc 8.5.0 on AlmaLinux 8.10 4.18.0-553.120.1.el8_10.x86_64
// export LD_LIBRARY_PATH=$HOME/libdeflate/libdeflate-1.25/build ; seq 0 127 | xargs -I{} numactl --physcpubind={} ./libdeflate_512_dpbusd | grep failed
// CPU 25 Decompression failed result=1 LIBDEFLATE_BAD_DATA
// CPU 89 Decompression failed result=1 LIBDEFLATE_BAD_DATA
// The cause of the failure is _mm512_dpbusd_epi32 / VDPBUSD on these 2 CPUs, from https://github.com/ebiggers/libdeflate/blob/28e88e9c6f6c197594753e7c9139e8a70c1a7b89/lib/x86/adler32_template.h#L369
// See test_dpbusd.c
int main() {
unsigned char compressed_data[] = "\x78\x01\x73\x0c\x75\x61\x70\x0a\xf2\x61\x70\x76\x74\x61\x70\xf6\x8b\x64\xf8\x0f\x04\xae\xa1\x41\x0c\xee\x4e\x01\x0c\x9e\x7e\x41\x0c\x5e\x01\x91\x\
0c\xbe\x91\x41\x0c\x7e\x51\x2e\x60\x39\x90\x3c\x3a\x08\xf1\x70\x82\xcb\x85\x06\x23\xd4\x45\x39\x06\xc1\xc5\xd1\xf5\x50\x93\x0f\x00\xfc\xa1\x9b\x19\x00\x00\x00";
size_t compressed_size = sizeof(compressed_data);
size_t uncompressed_size = 196;
char *out_buffer = malloc(uncompressed_size);
struct libdeflate_decompressor *decompressor = libdeflate_alloc_decompressor();
if (!decompressor) return 1;
enum libdeflate_result result = libdeflate_zlib_decompress(decompressor,
compressed_data, compressed_size, out_buffer, uncompressed_size, NULL);
const int cpu = sched_getcpu();
if (result == LIBDEFLATE_SUCCESS) {
printf("CPU %d Decompressed success\n", cpu);
} else {
fprintf(stderr, "CPU %d Decompression failed result=%d %s\n", cpu, result,
result==LIBDEFLATE_BAD_DATA ? "LIBDEFLATE_BAD_DATA" : "");
}
libdeflate_free_decompressor(decompressor);
free(out_buffer);
return 0;
}
Reproducer without libdeflate, calling _mm512_dpbusd_epi32
> cat test_dpbusd.c
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#define _GNU_SOURCE
#include <sched.h> // sched_getcpu
int sched_getcpu();
// gcc --version
// gcc (GCC) 8.5.0 20210514 (Red Hat 8.5.0-28)
// uname -a
// Linux REMOVED 4.18.0-553.120.1.el8_10.x86_64 #1 SMP Mon Apr 20 18:04:27 EDT 2026 x86_64 x86_64 x86_64 GNU/Linux
// egrep '(model name)|stepping|microcode|(cpu family)|model|flags' /proc/cpuinfo | head -n 6
// cpu family : 25
// model : 17
// model name : AMD EPYC 9384X 32-Core Processor
// stepping : 2
// microcode : 0xa101253
// flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good amd_lbr_v2 nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif x2avic v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d
// -O2 fails every time, as does -O0, but -O1 sometimes does not fail
// gcc -mavx512f -mavx512vnni -O2 test_dpbusd.c -o test_dpbusd
// ./test_dpbusd
// numactl --physcpubind=25 ./test_dpbusd
// seq 0 127 | xargs -I{} numactl --physcpubind={} ./test_dpbusd | grep FAIL
// CPU 25 TEST FAILED: Results do not match!
// CPU 89 TEST FAILED: Results do not match!
// Helper function to print __m512i as 64-bit integers
void print_vec_i64(__m512i v, const char* name) {
int64_t arr[8];
_mm512_storeu_si512(arr, v);
printf("%s: ", name);
for (int i = 0; i < 8; i++) {
printf("%ld ", arr[i]);
}
printf("\n");
}
// Helper function to print __m512i as 32-bit integers
void print_vec_i32(__m512i v, const char* name) {
int32_t arr[16];
_mm512_storeu_si512(arr, v);
printf("%s: ", name);
for (int i = 0; i < 16; i++) {
printf("%d ", arr[i]);
}
printf("\n");
}
// Manual computation of _mm512_dpbusd_epi32
// dpbusd: Dot Product of Bytes (Unsigned and Signed) and accumulate to Dword
// For each dword (32-bit): dst[i] += src1[4i+0]*src2[4i+0] + src1[4i+1]*src2[4i+1] +
// src1[4i+2]*src2[4i+2] + src1[4i+3]*src2[4i+3]
// where src1 bytes are unsigned, src2 bytes are signed
__m512i manual_dpbusd(__m512i dst, __m512i src1, __m512i src2) {
uint8_t s1[64];
int8_t s2[64];
int32_t d[16];
_mm512_storeu_si512(s1, src1);
_mm512_storeu_si512(s2, src2);
_mm512_storeu_si512(d, dst);
for (int i = 0; i < 16; i++) {
int sum = 0;
for (int j = 0; j < 4; j++) {
sum += (int)s1[i*4 + j] * (int)s2[i*4 + j];
}
d[i] += sum;
}
return _mm512_loadu_si512(d);
}
int main() {
#define vec_t __m512i
vec_t v_s2 = _mm512_setr_epi64(304882548649473, 702957297498669, 661412078891339,
271961624295744, 583329573380896, 1038265394263940,
968171528097900, 898077661810860);
vec_t data = _mm512_setr_epi64(-1, -1, -1, -1, -1, -1, -1, -1);
vec_t mults = _mm512_setr_epi64(4413034230074983236, 3834312847370369852, 3255591464665756468,
2676870081961143084, 2098148699256529700, 1519427316551916316,
940705933847302932, 361984551142689548);
printf("Initial values:\n");
print_vec_i64(v_s2, "v_s2 ");
print_vec_i64(data, "data ");
print_vec_i64(mults, "mults");
printf("\n");
// Create copies for testing
vec_t v_s2_intrinsic = v_s2;
vec_t v_s2_manual = v_s2;
// Test with intrinsic
v_s2_intrinsic = _mm512_dpbusd_epi32(v_s2_intrinsic, data, mults);
// Test with manual implementation
v_s2_manual = manual_dpbusd(v_s2_manual, data, mults);
printf("Results as 32-bit integers:\n");
print_vec_i32(v_s2_intrinsic, "Intrinsic");
print_vec_i32(v_s2_manual, "Manual ");
printf("\n");
printf("Results as 64-bit integers:\n");
print_vec_i64(v_s2_intrinsic, "Intrinsic");
print_vec_i64(v_s2_manual, "Manual ");
printf("\n");
// Compare results
int32_t intrinsic_arr[16], manual_arr[16];
_mm512_storeu_si512(intrinsic_arr, v_s2_intrinsic);
_mm512_storeu_si512(manual_arr, v_s2_manual);
int all_match = 1;
for (int i = 0; i < 16; i++) {
if (intrinsic_arr[i] != manual_arr[i]) {
printf("Mismatch at dword %d: intrinsic=%d, manual=%d\n",
i, intrinsic_arr[i], manual_arr[i]);
all_match = 0;
}
}
const int cpu = sched_getcpu();
if (all_match) {
printf("CPU %d TEST PASSED: Intrinsic and manual implementations match!\n", cpu);
return 0;
} else {
printf("CPU %d TEST FAILED: Results do not match!\n", cpu);
return 1;
}
}
Example failure:
> numactl --physcpubind=25 ./test_dpbusd
Initial values:
v_s2 : 304882548649473 702957297498669 661412078891339 271961624295744 583329573380896 1038265394263940 968171528097900 898077661810860
data : -1 -1 -1 -1 -1 -1 -1 -1
mults: 4413034230074983236 3834312847370369852 3255591464665756468 2676870081961143084 2098148699256529700 1519427316551916316 940705933847302932 361984551142689548
Results as 32-bit integers:
Intrinsic: 243447 134736 222019 218240 260737 201427 189078 102591 175254 166927 155930 263670 252450 240210 227970 215730
Manual : 243447 134736 222019 219260 260737 201427 189078 102591 175254 166927 155930 264690 252450 240210 227970 215730
Results as 64-bit integers:
Intrinsic: 578686713837303 937333662901059 865122377792129 440624990053014 716946005994646 1132454027092250 1031694094424610 926553294994050
Manual : 578686713837303 941714529542979 865122377792129 440624990053014 716946005994646 1136834893734170 1031694094424610 926553294994050
Mismatch at dword 3: intrinsic=218240, manual=219260
Mismatch at dword 11: intrinsic=263670, manual=264690
CPU 25 TEST FAILED: Results do not match!
Thank you for libdeflate.
This is not a bug report for libdeflate. This is a bug experienced while using libdeflate.
I have 2 CPUs (25 and 89 on a AMD EPYC 9384X) behaving incorrectly when running
_mm512_dpbusd_epi32, just on one host.From:
libdeflate/lib/x86/adler32_template.h
Line 369 in 28e88e9
This resulted in libdeflate failing to decompress with
LIBDEFLATE_BAD_DATA.I am wondering if anyone else has seen similar problems?
Maybe this issue could help anyone else seeing similar problems.
A workaround was to disable using the intrinsic instruction(s):
Reproducer using libdeflate:
Reproducer without libdeflate, calling
_mm512_dpbusd_epi32Example failure: