Incorrect LIBDEFLATE_BAD_DATA because of _mm512_dpbusd_epi32 behaving incorrectly on some CPUs on a AMD EPYC 9384X 32-Core host.

Thank you for libdeflate.

This is not a bug report for libdeflate.  This is a bug experienced while using libdeflate.

I have 2 CPUs (25 and 89 on a AMD EPYC 9384X) behaving incorrectly when running ```_mm512_dpbusd_epi32```, just on one host.
From: https://github.com/ebiggers/libdeflate/blob/28e88e9c6f6c197594753e7c9139e8a70c1a7b89/lib/x86/adler32_template.h#L369
This resulted in libdeflate failing to decompress with ```LIBDEFLATE_BAD_DATA```.

I am wondering if anyone else has seen similar problems?
Maybe this issue could help anyone else seeing similar problems.

A workaround was to disable using the intrinsic instruction(s):
```
cmake -DCMAKE_C_FLAGS="-DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI=1" ...
```

Reproducer using libdeflate:

```
> cat libdeflate_512_dpbusd.c
#include <libdeflate.h>
#include <stdio.h>
#include <stdlib.h> // malloc
#define _GNU_SOURCE
#include <sched.h> // sched_getcpu
int sched_getcpu();

// gcc -g -O0 libdeflate_512_dpbusd.c -I$HOME/libdeflate/libdeflate-1.25/ -L$HOME/libdeflate/libdeflate-1.25/build -ldeflate -o libdeflate_512_dpbusd
// LD_LIBRARY_PATH=$HOME/libdeflate/libdeflate-1.25/build ./libdeflate_512_dpbusd

// Seen on one AMD EPYC 9384X 32-Core Processor host, model 17, cpu family 25, microcode 0xa101253, gcc 8.5.0 on AlmaLinux 8.10 4.18.0-553.120.1.el8_10.x86_64
// export LD_LIBRARY_PATH=$HOME/libdeflate/libdeflate-1.25/build ; seq 0 127 | xargs -I{} numactl --physcpubind={} ./libdeflate_512_dpbusd | grep failed
// CPU 25 Decompression failed result=1 LIBDEFLATE_BAD_DATA
// CPU 89 Decompression failed result=1 LIBDEFLATE_BAD_DATA

// The cause of the failure is _mm512_dpbusd_epi32 / VDPBUSD on these 2 CPUs, from https://github.com/ebiggers/libdeflate/blob/28e88e9c6f6c197594753e7c9139e8a70c1a7b89/lib/x86/adler32_template.h#L369
// See test_dpbusd.c

int main() {
    unsigned char compressed_data[] = "\x78\x01\x73\x0c\x75\x61\x70\x0a\xf2\x61\x70\x76\x74\x61\x70\xf6\x8b\x64\xf8\x0f\x04\xae\xa1\x41\x0c\xee\x4e\x01\x0c\x9e\x7e\x41\x0c\x5e\x01\x91\x\
0c\xbe\x91\x41\x0c\x7e\x51\x2e\x60\x39\x90\x3c\x3a\x08\xf1\x70\x82\xcb\x85\x06\x23\xd4\x45\x39\x06\xc1\xc5\xd1\xf5\x50\x93\x0f\x00\xfc\xa1\x9b\x19\x00\x00\x00";
    size_t compressed_size = sizeof(compressed_data);
    size_t uncompressed_size = 196;
    char *out_buffer = malloc(uncompressed_size);

    struct libdeflate_decompressor *decompressor = libdeflate_alloc_decompressor();
    if (!decompressor) return 1;

    enum libdeflate_result result = libdeflate_zlib_decompress(decompressor,
        compressed_data, compressed_size, out_buffer, uncompressed_size, NULL);

    const int cpu = sched_getcpu();
    if (result == LIBDEFLATE_SUCCESS) {
      printf("CPU %d Decompressed success\n", cpu);
    } else {
      fprintf(stderr, "CPU %d Decompression failed result=%d %s\n", cpu, result,
                      result==LIBDEFLATE_BAD_DATA ? "LIBDEFLATE_BAD_DATA" : "");
    }

    libdeflate_free_decompressor(decompressor);
    free(out_buffer);
    return 0;
}
```

Reproducer without libdeflate, calling ```_mm512_dpbusd_epi32```

```
> cat test_dpbusd.c
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#define _GNU_SOURCE
#include <sched.h> // sched_getcpu
int sched_getcpu();

// gcc --version
// gcc (GCC) 8.5.0 20210514 (Red Hat 8.5.0-28)

// uname -a
// Linux REMOVED 4.18.0-553.120.1.el8_10.x86_64 #1 SMP Mon Apr 20 18:04:27 EDT 2026 x86_64 x86_64 x86_64 GNU/Linux

// egrep '(model name)|stepping|microcode|(cpu family)|model|flags' /proc/cpuinfo | head -n 6
// cpu family      : 25
// model           : 17
// model name      : AMD EPYC 9384X 32-Core Processor
// stepping        : 2
// microcode       : 0xa101253
// flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good amd_lbr_v2 nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif x2avic v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d

// -O2 fails every time, as does -O0, but -O1 sometimes does not fail
// gcc -mavx512f -mavx512vnni -O2 test_dpbusd.c -o test_dpbusd
// ./test_dpbusd
// numactl --physcpubind=25 ./test_dpbusd
// seq 0 127 | xargs -I{} numactl --physcpubind={} ./test_dpbusd | grep FAIL
// CPU 25 TEST FAILED: Results do not match!
// CPU 89 TEST FAILED: Results do not match!

// Helper function to print __m512i as 64-bit integers
void print_vec_i64(__m512i v, const char* name) {
    int64_t arr[8];
    _mm512_storeu_si512(arr, v);
    printf("%s: ", name);
    for (int i = 0; i < 8; i++) {
        printf("%ld ", arr[i]);
    }
    printf("\n");
}

// Helper function to print __m512i as 32-bit integers
void print_vec_i32(__m512i v, const char* name) {
    int32_t arr[16];
    _mm512_storeu_si512(arr, v);
    printf("%s: ", name);
    for (int i = 0; i < 16; i++) {
        printf("%d ", arr[i]);
    }
    printf("\n");
}

// Manual computation of _mm512_dpbusd_epi32
// dpbusd: Dot Product of Bytes (Unsigned and Signed) and accumulate to Dword
// For each dword (32-bit): dst[i] += src1[4i+0]*src2[4i+0] + src1[4i+1]*src2[4i+1] +
//                                   src1[4i+2]*src2[4i+2] + src1[4i+3]*src2[4i+3]
// where src1 bytes are unsigned, src2 bytes are signed
__m512i manual_dpbusd(__m512i dst, __m512i src1, __m512i src2) {
    uint8_t s1[64];
    int8_t s2[64];
    int32_t d[16];

    _mm512_storeu_si512(s1, src1);
    _mm512_storeu_si512(s2, src2);
    _mm512_storeu_si512(d, dst);

    for (int i = 0; i < 16; i++) {
        int sum = 0;
        for (int j = 0; j < 4; j++) {
            sum += (int)s1[i*4 + j] * (int)s2[i*4 + j];
        }
        d[i] += sum;
    }

    return _mm512_loadu_si512(d);
}

int main() {
    #define vec_t __m512i

    vec_t v_s2  = _mm512_setr_epi64(304882548649473, 702957297498669, 661412078891339,
                                     271961624295744, 583329573380896, 1038265394263940,
                                     968171528097900, 898077661810860);
    vec_t data  = _mm512_setr_epi64(-1, -1, -1, -1, -1, -1, -1, -1);
    vec_t mults = _mm512_setr_epi64(4413034230074983236, 3834312847370369852, 3255591464665756468,
                                     2676870081961143084, 2098148699256529700, 1519427316551916316,
                                     940705933847302932, 361984551142689548);

    printf("Initial values:\n");
    print_vec_i64(v_s2, "v_s2 ");
    print_vec_i64(data, "data ");
    print_vec_i64(mults, "mults");
    printf("\n");

    // Create copies for testing
    vec_t v_s2_intrinsic = v_s2;
    vec_t v_s2_manual = v_s2;

    // Test with intrinsic
    v_s2_intrinsic = _mm512_dpbusd_epi32(v_s2_intrinsic, data, mults);

    // Test with manual implementation
    v_s2_manual = manual_dpbusd(v_s2_manual, data, mults);

    printf("Results as 32-bit integers:\n");
    print_vec_i32(v_s2_intrinsic, "Intrinsic");
    print_vec_i32(v_s2_manual, "Manual   ");
    printf("\n");

    printf("Results as 64-bit integers:\n");
    print_vec_i64(v_s2_intrinsic, "Intrinsic");
    print_vec_i64(v_s2_manual, "Manual   ");
    printf("\n");

    // Compare results
    int32_t intrinsic_arr[16], manual_arr[16];
    _mm512_storeu_si512(intrinsic_arr, v_s2_intrinsic);
    _mm512_storeu_si512(manual_arr, v_s2_manual);

    int all_match = 1;
    for (int i = 0; i < 16; i++) {
        if (intrinsic_arr[i] != manual_arr[i]) {
            printf("Mismatch at dword %d: intrinsic=%d, manual=%d\n",
                   i, intrinsic_arr[i], manual_arr[i]);
            all_match = 0;
        }
    }

    const int cpu = sched_getcpu();
    if (all_match) {
      printf("CPU %d TEST PASSED: Intrinsic and manual implementations match!\n", cpu);
        return 0;
    } else {
      printf("CPU %d TEST FAILED: Results do not match!\n", cpu);
        return 1;
    }
}
```

Example failure:

```
> numactl --physcpubind=25 ./test_dpbusd
Initial values:
v_s2 : 304882548649473 702957297498669 661412078891339 271961624295744 583329573380896 1038265394263940 968171528097900 898077661810860
data : -1 -1 -1 -1 -1 -1 -1 -1
mults: 4413034230074983236 3834312847370369852 3255591464665756468 2676870081961143084 2098148699256529700 1519427316551916316 940705933847302932 361984551142689548

Results as 32-bit integers:
Intrinsic: 243447 134736 222019 218240 260737 201427 189078 102591 175254 166927 155930 263670 252450 240210 227970 215730
Manual   : 243447 134736 222019 219260 260737 201427 189078 102591 175254 166927 155930 264690 252450 240210 227970 215730

Results as 64-bit integers:
Intrinsic: 578686713837303 937333662901059 865122377792129 440624990053014 716946005994646 1132454027092250 1031694094424610 926553294994050
Manual   : 578686713837303 941714529542979 865122377792129 440624990053014 716946005994646 1136834893734170 1031694094424610 926553294994050

Mismatch at dword 3: intrinsic=218240, manual=219260
Mismatch at dword 11: intrinsic=263670, manual=264690
CPU 25 TEST FAILED: Results do not match!
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Incorrect LIBDEFLATE_BAD_DATA because of _mm512_dpbusd_epi32 behaving incorrectly on some CPUs on a AMD EPYC 9384X 32-Core host. #451

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Incorrect LIBDEFLATE_BAD_DATA because of _mm512_dpbusd_epi32 behaving incorrectly on some CPUs on a AMD EPYC 9384X 32-Core host. #451

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions