itzmeanjan · itzmeanjan · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/.github/workflows/test_ci.yml b/.github/workflows/test_ci.yml
@@ -81,3 +81,28 @@ jobs:
           for exe in build/sha3_224_example build/sha3_256_example build/sha3_384_example build/sha3_512_example build/shake128_example build/shake256_example build/turboshake128_example build/turboshake256_example; do
             ./$exe
           done
+
+  test-avx2:  # Separate CI job: Release build configured with SHA3_NATIVE_OPT=ON, followed by a ctest run.
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # a failure for one compiler does not cancel the other matrix entry
+      matrix:
+        compiler: [g++, clang++]  # each entry is passed to CMAKE_CXX_COMPILER in the configure step
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Configure (AVX2)  # NOTE(review): AVX2 is not requested explicitly; presumably SHA3_NATIVE_OPT=ON enables it from the runner's native CPU features - confirm in the CMake files.
+        run: >
+          cmake -B build
+          -DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
+          -DCMAKE_BUILD_TYPE=Release
+          -DSHA3_BUILD_TESTS=ON
+          -DSHA3_BUILD_BENCHMARKS=ON
+          -DSHA3_FETCH_DEPS=ON
+          -DSHA3_NATIVE_OPT=ON
+
+      - name: Build  # benchmarks are built too (SHA3_BUILD_BENCHMARKS=ON above), but only ctest is executed afterwards
+        run: cmake --build build -j
+
+      - name: Test  # runs the tests from the build/ tree; output is printed only for failing tests
+        run: ctest --test-dir build --output-on-failure -j
diff --git a/benches/bench_hashing.cpp b/benches/bench_hashing.cpp
@@ -25,7 +25,7 @@ bench_sha3_224(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + sha3_224::DIGEST_LEN);
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_224::DIGEST_LEN);
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -50,7 +50,7 @@ bench_sha3_256(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + sha3_256::DIGEST_LEN);
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_256::DIGEST_LEN);
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -75,7 +75,7 @@ bench_sha3_384(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + sha3_384::DIGEST_LEN);
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_384::DIGEST_LEN);
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -100,7 +100,7 @@ bench_sha3_512(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + sha3_512::DIGEST_LEN);
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_512::DIGEST_LEN);
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE

diff --git a/benches/bench_keccak.cpp b/benches/bench_keccak.cpp
@@ -3,6 +3,10 @@
 #include <benchmark/benchmark.h>
 #include <cstdint>
 
+#if defined(__AVX2__)
+#include "sha3/internals/keccak_x4.hpp"
+#endif
+
 namespace {
 
 // Benchmarks Keccak-p[1600, 12] or Keccak-p[1600, 24] permutation.
@@ -20,15 +24,53 @@ bench_keccak_permutation(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * sizeof(st);
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * sizeof(st);
+  state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
+
+#ifdef CYCLES_PER_BYTE
+  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
+#endif
+}
+
+#if defined(__AVX2__) // the x4 benchmark is compiled only when the compiler targets AVX2
+
+// Benchmarks 4-way parallel Keccak-p[1600, 12] or Keccak-p[1600, 24] permutation using AVX2. `num_rounds` is forwarded to keccak_x4::permute; this file registers the 12 and 24 round instantiations.
+template<size_t num_rounds>
+void
+bench_keccak_x4_permutation(benchmark::State& state)
+{
+  using vec = keccak_x4::vec;
+
+  std::array<vec, keccak::LANE_CNT> st{}; // one `vec` per Keccak lane; each `vec` is loaded from four uint64_t words below
+  for (auto& lane : st) {
+    std::array<uint64_t, 4> tmp{};
+    generate_random_data<uint64_t>(tmp); // randomize the four 64-bit words (helper is defined outside this diff)
+    lane = vec::load(tmp);
+  }
+
+  for (auto _ : state) { // timed loop: one permute call per iteration on the same `st` object
+    keccak_x4::permute<num_rounds>(st);
+
+    benchmark::DoNotOptimize(st); // keep the permuted state observable so the call cannot be optimized away
+    benchmark::ClobberMemory();
+  }
+
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * sizeof(st); // sizeof(st) covers every `vec` lane, i.e. the whole x4 state per iteration
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
   state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
 #endif
 }
 
+#endif // defined(__AVX2__)
+
 }
 
 BENCHMARK(bench_keccak_permutation<12>)->Name("keccak-p[1600, 12]")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);
 BENCHMARK(bench_keccak_permutation<24>)->Name("keccak-p[1600, 24]")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);
+
+#if defined(__AVX2__) // same guard as the definition of bench_keccak_x4_permutation, so the symbol exists whenever it is registered
+BENCHMARK(bench_keccak_x4_permutation<12>)->Name("keccak-p[1600, 12] x4/avx2")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max); // 12-round variant
+BENCHMARK(bench_keccak_x4_permutation<24>)->Name("keccak-p[1600, 24] x4/avx2")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max); // 24-round variant
+#endif // defined(__AVX2__)
diff --git a/benches/bench_xof.cpp b/benches/bench_xof.cpp
@@ -6,6 +6,11 @@
 #include <benchmark/benchmark.h>
 #include <cstdint>
 
+#if defined(__AVX2__)
+#include "sha3/shake128_x4.hpp"
+#include "sha3/shake256_x4.hpp"
+#endif
+
 namespace {
 
 /**
@@ -37,7 +42,7 @@ bench_shake128(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -74,7 +79,7 @@ bench_shake256(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -111,7 +116,7 @@ bench_turboshake128(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
@@ -148,14 +153,106 @@ bench_turboshake256(benchmark::State& state)
     benchmark::ClobberMemory();
   }
 
-  const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
   state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
 
 #ifdef CYCLES_PER_BYTE
   state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
 #endif
 }
 
+#if defined(__AVX2__)
+
+// Benchmarks 4-way parallel SHAKE128 XOF using AVX2. Each iteration absorbs four equal-length messages, finalizes, and squeezes four equal-length outputs.
+void
+bench_shake128_x4(benchmark::State& state)
+{
+  const auto mlen = static_cast<size_t>(state.range(0)); // message length in bytes, per instance (first benchmark argument)
+  const auto olen = static_cast<size_t>(state.range(1)); // output length in bytes, per instance (second benchmark argument)
+
+  std::vector<uint8_t> msg0(mlen); // four input buffers, one per parallel instance
+  std::vector<uint8_t> msg1(mlen);
+  std::vector<uint8_t> msg2(mlen);
+  std::vector<uint8_t> msg3(mlen);
+
+  std::vector<uint8_t> out0(olen); // four output buffers, passed to squeeze() inside the loop
+  std::vector<uint8_t> out1(olen);
+  std::vector<uint8_t> out2(olen);
+  std::vector<uint8_t> out3(olen);
+
+  generate_random_data<uint8_t>(msg0); // inputs are randomized once, outside the timed loop
+  generate_random_data<uint8_t>(msg1);
+  generate_random_data<uint8_t>(msg2);
+  generate_random_data<uint8_t>(msg3);
+
+  for (auto _ : state) {
+    shake128_x4::shake128_x4_t hasher; // a fresh hasher per iteration, so its construction is part of the measured work
+    hasher.absorb(msg0, msg1, msg2, msg3);
+    hasher.finalize();
+    hasher.squeeze(out0, out1, out2, out3);
+
+    benchmark::DoNotOptimize(hasher); // keep hasher and outputs observable so the hashing is not optimized away
+    benchmark::DoNotOptimize(out0);
+    benchmark::DoNotOptimize(out1);
+    benchmark::DoNotOptimize(out2);
+    benchmark::DoNotOptimize(out3);
+    benchmark::ClobberMemory();
+  }
+
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * 4 * (mlen + olen); // input plus output bytes, summed over all four instances
+  state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
+
+#ifdef CYCLES_PER_BYTE // NOTE(review): the "CYCLES" counter is not set in this function; presumably populated elsewhere - confirm
+  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
+#endif
+}
+
+// Benchmarks 4-way parallel SHAKE256 XOF using AVX2. Each iteration absorbs four equal-length messages, finalizes, and squeezes four equal-length outputs.
+void
+bench_shake256_x4(benchmark::State& state)
+{
+  const auto mlen = static_cast<size_t>(state.range(0)); // message length in bytes, per instance (first benchmark argument)
+  const auto olen = static_cast<size_t>(state.range(1)); // output length in bytes, per instance (second benchmark argument)
+
+  std::vector<uint8_t> msg0(mlen); // four input buffers, one per parallel instance
+  std::vector<uint8_t> msg1(mlen);
+  std::vector<uint8_t> msg2(mlen);
+  std::vector<uint8_t> msg3(mlen);
+
+  std::vector<uint8_t> out0(olen); // four output buffers, passed to squeeze() inside the loop
+  std::vector<uint8_t> out1(olen);
+  std::vector<uint8_t> out2(olen);
+  std::vector<uint8_t> out3(olen);
+
+  generate_random_data<uint8_t>(msg0); // inputs are randomized once, outside the timed loop
+  generate_random_data<uint8_t>(msg1);
+  generate_random_data<uint8_t>(msg2);
+  generate_random_data<uint8_t>(msg3);
+
+  for (auto _ : state) {
+    shake256_x4::shake256_x4_t hasher; // a fresh hasher per iteration, so its construction is part of the measured work
+    hasher.absorb(msg0, msg1, msg2, msg3);
+    hasher.finalize();
+    hasher.squeeze(out0, out1, out2, out3);
+
+    benchmark::DoNotOptimize(hasher); // keep hasher and outputs observable so the hashing is not optimized away
+    benchmark::DoNotOptimize(out0);
+    benchmark::DoNotOptimize(out1);
+    benchmark::DoNotOptimize(out2);
+    benchmark::DoNotOptimize(out3);
+    benchmark::ClobberMemory();
+  }
+
+  const size_t bytes_processed = static_cast<size_t>(state.iterations()) * 4 * (mlen + olen); // input plus output bytes, summed over all four instances
+  state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));
+
+#ifdef CYCLES_PER_BYTE // NOTE(review): the "CYCLES" counter is not set in this function; presumably populated elsewhere - confirm
+  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
+#endif
+}
+
+#endif
+
 }
 
 BENCHMARK(bench_shake128)
@@ -178,3 +275,16 @@ BENCHMARK(bench_turboshake256)
   ->Name("turboshake256")
   ->ComputeStatistics("min", compute_min)
   ->ComputeStatistics("max", compute_max);
+
+#if defined(__AVX2__) // same guard as the x4 benchmark definitions, so they exist whenever they are registered
+BENCHMARK(bench_shake128_x4)
+  ->ArgsProduct({ benchmark::CreateRange(64, 16384, 4), { 64 } }) // range(0) = message length: 64, 256, 1024, 4096, 16384; range(1) = output length: 64
+  ->Name("shake128_x4/avx2")
+  ->ComputeStatistics("min", compute_min)
+  ->ComputeStatistics("max", compute_max);
+BENCHMARK(bench_shake256_x4)
+  ->ArgsProduct({ benchmark::CreateRange(64, 16384, 4), { 64 } }) // same argument grid as shake128_x4 above
+  ->Name("shake256_x4/avx2")
+  ->ComputeStatistics("min", compute_min)
+  ->ComputeStatistics("max", compute_max);
+#endif // defined(__AVX2__)