Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/test_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,28 @@ jobs:
for exe in build/sha3_224_example build/sha3_256_example build/sha3_384_example build/sha3_512_example build/shake128_example build/shake256_example build/turboshake128_example build/turboshake256_example; do
./$exe
done

# CI job that configures, builds and tests the project with SHA3_NATIVE_OPT enabled,
# once per compiler listed in the matrix below.
test-avx2:
runs-on: ubuntu-latest
strategy:
# Keep the remaining matrix entries running even if one compiler's job fails.
fail-fast: false
matrix:
compiler: [g++, clang++]
steps:
- uses: actions/checkout@v6

# Tests and benchmarks are both built, dependencies are fetched by the build itself.
# NOTE(review): SHA3_NATIVE_OPT=ON presumably turns on host-native code generation so the
# `__AVX2__`-guarded code gets compiled - confirm against CMakeLists.txt and the runner CPU.
- name: Configure (AVX2)
run: >
cmake -B build
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
-DCMAKE_BUILD_TYPE=Release
-DSHA3_BUILD_TESTS=ON
-DSHA3_BUILD_BENCHMARKS=ON
-DSHA3_FETCH_DEPS=ON
-DSHA3_NATIVE_OPT=ON

# Parallel build; `-j` without a value lets the build tool pick the job count.
- name: Build
run: cmake --build build -j

# Runs the registered tests in parallel and prints the output of failing tests only.
- name: Test
run: ctest --test-dir build --output-on-failure -j
8 changes: 4 additions & 4 deletions benches/bench_hashing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bench_sha3_224(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + sha3_224::DIGEST_LEN);
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_224::DIGEST_LEN);
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand All @@ -50,7 +50,7 @@ bench_sha3_256(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + sha3_256::DIGEST_LEN);
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_256::DIGEST_LEN);
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand All @@ -75,7 +75,7 @@ bench_sha3_384(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + sha3_384::DIGEST_LEN);
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_384::DIGEST_LEN);
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand All @@ -100,7 +100,7 @@ bench_sha3_512(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + sha3_512::DIGEST_LEN);
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + sha3_512::DIGEST_LEN);
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand Down
44 changes: 43 additions & 1 deletion benches/bench_keccak.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
#include <benchmark/benchmark.h>
#include <cstdint>

#if defined(__AVX2__)
#include "sha3/internals/keccak_x4.hpp"
#endif

namespace {

// Benchmarks Keccak-p[1600, 12] or Keccak-p[1600, 24] permutation.
Expand All @@ -20,15 +24,53 @@ bench_keccak_permutation(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * sizeof(st);
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * sizeof(st);
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
#endif
}

#if defined(__AVX2__)

// Measures the AVX2-backed, 4-way parallel Keccak-p[1600, num_rounds] permutation (num_rounds = 12 or 24).
//
// The vectorized state holds `keccak::LANE_CNT` vectors; every vector is loaded from four freshly
// generated random 64-bit words before timing starts. Reported throughput counts `sizeof` the whole
// vectorized state once per benchmark iteration.
template<size_t num_rounds>
void
bench_keccak_x4_permutation(benchmark::State& state)
{
  using lane_vec_t = keccak_x4::vec;

  // Seed the state, one vector at a time, outside of the timed region.
  std::array<lane_vec_t, keccak::LANE_CNT> x4_state{};
  for (size_t lane_idx = 0; lane_idx < x4_state.size(); lane_idx++) {
    std::array<uint64_t, 4> random_words{};
    generate_random_data<uint64_t>(random_words);

    x4_state[lane_idx] = lane_vec_t::load(random_words);
  }

  // Timed region: the permutation is applied in-place, over and over, on the same state.
  for (auto _ : state) {
    keccak_x4::permute<num_rounds>(x4_state);

    // Keep the compiler from discarding the permutation result.
    benchmark::DoNotOptimize(x4_state);
    benchmark::ClobberMemory();
  }

  const size_t iteration_count = static_cast<size_t>(state.iterations());
  const size_t total_bytes = sizeof(x4_state) * iteration_count;
  state.SetBytesProcessed(static_cast<int64_t>(total_bytes));

#ifdef CYCLES_PER_BYTE
  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(total_bytes);
#endif
}

#endif

}

// Register the scalar permutation benchmark for both round counts (12 and 24). Each registration
// also reports custom "min" and "max" statistics, computed by `compute_min` / `compute_max`
// (defined outside this view).
BENCHMARK(bench_keccak_permutation<12>)->Name("keccak-p[1600, 12]")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);
BENCHMARK(bench_keccak_permutation<24>)->Name("keccak-p[1600, 24]")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);

// The 4-way parallel variants are registered only when the compiler defines `__AVX2__`,
// matching the guard around `bench_keccak_x4_permutation` itself.
#if defined(__AVX2__)
BENCHMARK(bench_keccak_x4_permutation<12>)->Name("keccak-p[1600, 12] x4/avx2")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);
BENCHMARK(bench_keccak_x4_permutation<24>)->Name("keccak-p[1600, 24] x4/avx2")->ComputeStatistics("min", compute_min)->ComputeStatistics("max", compute_max);
#endif
118 changes: 114 additions & 4 deletions benches/bench_xof.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
#include <benchmark/benchmark.h>
#include <cstdint>

#if defined(__AVX2__)
#include "sha3/shake128_x4.hpp"
#include "sha3/shake256_x4.hpp"
#endif

namespace {

/**
Expand Down Expand Up @@ -37,7 +42,7 @@ bench_shake128(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand Down Expand Up @@ -74,7 +79,7 @@ bench_shake256(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand Down Expand Up @@ -111,7 +116,7 @@ bench_turboshake128(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
Expand Down Expand Up @@ -148,14 +153,106 @@ bench_turboshake256(benchmark::State& state)
benchmark::ClobberMemory();
}

const size_t bytes_processed = state.iterations() * (msg.size() + out.size());
const size_t bytes_processed = static_cast<size_t>(state.iterations()) * (msg.size() + out.size());
state.SetBytesProcessed(static_cast<int64_t>(bytes_processed));

#ifdef CYCLES_PER_BYTE
state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(bytes_processed);
#endif
}

#if defined(__AVX2__)

// Measures the AVX2-backed, 4-way parallel SHAKE128 XOF.
//
// `state.range(0)` is the byte length of each of the four input messages and `state.range(1)` the
// byte length of each of the four outputs. Every iteration constructs a fresh hasher, absorbs the
// four random messages, finalizes and squeezes the four outputs. Reported throughput counts
// 4 * (message length + output length) bytes per iteration.
void
bench_shake128_x4(benchmark::State& state)
{
  constexpr size_t instance_count = 4;

  const size_t msg_byte_len = static_cast<size_t>(state.range(0));
  const size_t out_byte_len = static_cast<size_t>(state.range(1));

  // Builds one message buffer of the requested length, filled with random bytes.
  const auto make_random_message = [msg_byte_len]() {
    std::vector<uint8_t> buffer(msg_byte_len);
    generate_random_data<uint8_t>(buffer);
    return buffer;
  };

  std::vector<uint8_t> msg_a = make_random_message();
  std::vector<uint8_t> msg_b = make_random_message();
  std::vector<uint8_t> msg_c = make_random_message();
  std::vector<uint8_t> msg_d = make_random_message();

  std::vector<uint8_t> out_a(out_byte_len);
  std::vector<uint8_t> out_b(out_byte_len);
  std::vector<uint8_t> out_c(out_byte_len);
  std::vector<uint8_t> out_d(out_byte_len);

  for (auto _ : state) {
    // Hasher construction is part of the timed region.
    shake128_x4::shake128_x4_t xof;
    xof.absorb(msg_a, msg_b, msg_c, msg_d);
    xof.finalize();
    xof.squeeze(out_a, out_b, out_c, out_d);

    // Keep the compiler from discarding the hasher state or any of the squeezed outputs.
    benchmark::DoNotOptimize(xof);
    benchmark::DoNotOptimize(out_a);
    benchmark::DoNotOptimize(out_b);
    benchmark::DoNotOptimize(out_c);
    benchmark::DoNotOptimize(out_d);
    benchmark::ClobberMemory();
  }

  const size_t bytes_per_iteration = instance_count * (msg_byte_len + out_byte_len);
  const size_t total_bytes = static_cast<size_t>(state.iterations()) * bytes_per_iteration;
  state.SetBytesProcessed(static_cast<int64_t>(total_bytes));

#ifdef CYCLES_PER_BYTE
  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(total_bytes);
#endif
}

// Measures the AVX2-backed, 4-way parallel SHAKE256 XOF.
//
// `state.range(0)` is the byte length of each of the four input messages and `state.range(1)` the
// byte length of each of the four outputs. Every iteration constructs a fresh hasher, absorbs the
// four random messages, finalizes and squeezes the four outputs. Reported throughput counts
// 4 * (message length + output length) bytes per iteration.
void
bench_shake256_x4(benchmark::State& state)
{
  constexpr size_t instance_count = 4;

  const size_t msg_byte_len = static_cast<size_t>(state.range(0));
  const size_t out_byte_len = static_cast<size_t>(state.range(1));

  // Builds one message buffer of the requested length, filled with random bytes.
  const auto make_random_message = [msg_byte_len]() {
    std::vector<uint8_t> buffer(msg_byte_len);
    generate_random_data<uint8_t>(buffer);
    return buffer;
  };

  std::vector<uint8_t> msg_a = make_random_message();
  std::vector<uint8_t> msg_b = make_random_message();
  std::vector<uint8_t> msg_c = make_random_message();
  std::vector<uint8_t> msg_d = make_random_message();

  std::vector<uint8_t> out_a(out_byte_len);
  std::vector<uint8_t> out_b(out_byte_len);
  std::vector<uint8_t> out_c(out_byte_len);
  std::vector<uint8_t> out_d(out_byte_len);

  for (auto _ : state) {
    // Hasher construction is part of the timed region.
    shake256_x4::shake256_x4_t xof;
    xof.absorb(msg_a, msg_b, msg_c, msg_d);
    xof.finalize();
    xof.squeeze(out_a, out_b, out_c, out_d);

    // Keep the compiler from discarding the hasher state or any of the squeezed outputs.
    benchmark::DoNotOptimize(xof);
    benchmark::DoNotOptimize(out_a);
    benchmark::DoNotOptimize(out_b);
    benchmark::DoNotOptimize(out_c);
    benchmark::DoNotOptimize(out_d);
    benchmark::ClobberMemory();
  }

  const size_t bytes_per_iteration = instance_count * (msg_byte_len + out_byte_len);
  const size_t total_bytes = static_cast<size_t>(state.iterations()) * bytes_per_iteration;
  state.SetBytesProcessed(static_cast<int64_t>(total_bytes));

#ifdef CYCLES_PER_BYTE
  state.counters["CYCLES/ BYTE"] = state.counters["CYCLES"] / static_cast<double>(total_bytes);
#endif
}

#endif

}

BENCHMARK(bench_shake128)
Expand All @@ -178,3 +275,16 @@ BENCHMARK(bench_turboshake256)
->Name("turboshake256")
->ComputeStatistics("min", compute_min)
->ComputeStatistics("max", compute_max);

// The 4-way parallel XOF benchmarks are registered only when the compiler defines `__AVX2__`,
// matching the guard around the benchmark functions themselves.
#if defined(__AVX2__)
// Argument 0 (per-message byte length) sweeps 64..16384 with a range multiplier of 4;
// argument 1 (per-output byte length) is fixed at 64. Custom "min"/"max" statistics are
// computed by `compute_min` / `compute_max` (defined outside this view).
BENCHMARK(bench_shake128_x4)
->ArgsProduct({ benchmark::CreateRange(64, 16384, 4), { 64 } })
->Name("shake128_x4/avx2")
->ComputeStatistics("min", compute_min)
->ComputeStatistics("max", compute_max);
// Same argument grid and statistics as the SHAKE128 x4 registration.
BENCHMARK(bench_shake256_x4)
->ArgsProduct({ benchmark::CreateRange(64, 16384, 4), { 64 } })
->Name("shake256_x4/avx2")
->ComputeStatistics("min", compute_min)
->ComputeStatistics("max", compute_max);
#endif
#endif
Loading