From 1b93d0c64906fc7e85f8de90b72717370a08a84b Mon Sep 17 00:00:00 2001
From: Rac <rac75116@gmail.com>
Date: Sun, 4 Jan 2026 00:48:36 +0900
Subject: [PATCH 1/2] Add copilot instructions and tasks configuration; enhance
 benchmark output formatting

---
 .github/copilot-instructions.md | 41 +++++++++++++++++++++++++++
 .vscode/tasks.json              | 50 +++++++++++++++++++++++++++++++++
 benchmarks/isprime_bench.cpp    | 28 +++++++++++++-----
 3 files changed, 112 insertions(+), 7 deletions(-)
 create mode 100644 .github/copilot-instructions.md
 create mode 100644 .vscode/tasks.json

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..7808cf2
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,41 @@
+This is a C++ header-only library called "libcpprime", intended for fast primality testing of 64-bit integers.
+
+
+The library itself and its test code should be written to work with compilers supporting C++11.
+Benchmark and other temporary files may use the latest C++ features.
+The library implementation should pay particular attention to compatibility with older compilers.
+
+
+Supported compilers include gcc, clang, msvc, clang-cl, and the gcc and clang versions within mingw.
+
+
+When you want to run tests or benchmarks, execute the tasks defined in .vscode/tasks.json.
+For detailed benchmark results, please refer to benchmarks/bench_summary.md.
+Running tests and benchmarks takes approximately 20-30 seconds.
+
+
+When optimizing code, primarily use gcc or msvc for benchmarks.
+However, to avoid significant speed differences between compilers, also run the benchmarks with clang and clang-cl once you have finished the initial implementation.
+
+Please inform users of any breaking changes.
+
+
+.txt files often contain large amounts of data. Do not read files with the .txt extension.
+
+
+If you wish to generate data mechanically, please create C++ code or Python scripts within the tmp folder.
+
+
+For primality testing in Python, you can use SymPy (`sympy.isprime`). For execution speed, prefer running scripts with PyPy.
+
+
+All code and README.md should be written in English. However, write your chat responses in the language currently used in the conversation.
+
+The directory structure is as follows:
+- include/libcpprime/ : Main library code
+- benchmarks/ : Code for benchmarks
+- benchmarks/bench_* : Benchmark results
+- tests/ : Code and test cases for tests
+- docs/ : Data used for documentation
+- tmp/ : Files used for experiments, etc.
+- /README.md : README
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..f5c40cb
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,50 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Test (gcc)",
+            "type": "shell",
+            "command": "task test:gcc",
+        },
+        {
+            "label": "Test (clang)",
+            "type": "shell",
+            "command": "task test:clang",
+        },
+        {
+            "label": "Test (msvc)",
+            "type": "shell",
+            "command": "task test:msvc",
+        },
+        {
+            "label": "Test (clang-cl)",
+            "type": "shell",
+            "command": "task test:clang-cl",
+        },
+        {
+            "label": "Benchmark (gcc)",
+            "type": "shell",
+            "command": "task bench:gcc",
+        },
+        {
+            "label": "Benchmark (clang)",
+            "type": "shell",
+            "command": "task bench:clang",
+        },
+        {
+            "label": "Benchmark (msvc)",
+            "type": "shell",
+            "command": "task bench:msvc",
+        },
+        {
+            "label": "Benchmark (clang-cl)",
+            "type": "shell",
+            "command": "task bench:clang-cl",
+        },
+        {
+            "label": "Generate Documentation",
+            "type": "shell",
+            "command": "task docs",
+        },
+    ]
+}
diff --git a/benchmarks/isprime_bench.cpp b/benchmarks/isprime_bench.cpp
index eb05065..cdc0675 100644
--- a/benchmarks/isprime_bench.cpp
+++ b/benchmarks/isprime_bench.cpp
@@ -6,6 +6,7 @@
 #include <cstdio>
 #include <filesystem>
 #include <fstream>
+#include <iomanip>
 #include <ios>
 #include <iostream>
 #include <libcpprime/IsPrime.hpp>
@@ -31,7 +32,7 @@ int main(int argc, char** argv) {
             weighted[count++] = 64 - i;
         }
     }
-    auto bench = [rng = Rng(42), heavy](bool (*func)(std::uint64_t)) mutable {
+    auto bench = [rng = Rng(), heavy](bool (*func)(std::uint64_t)) mutable {
         std::uint32_t k = weighted[rng.bounded(89440)];
         std::uint64_t n = (rng() >> k) | 1;
         int iters = (heavy ? 300 : 250);
@@ -124,13 +125,26 @@ int main(int argc, char** argv) {
     f_summary << "avg_time_prime_IsPrime,avg_time_prime_IsPrimeNoTable,avg_time_composite_IsPrime,avg_time_composite_IsPrimeNoTable\n";
     f_summary_md << "| Bit Width | IsPrime Avg Time (ns, prime) | IsPrimeNoTable Avg Time (ns, prime) | IsPrime Avg Time (ns, composite) | IsPrimeNoTable Avg Time (ns, composite) |\n";
     f_summary_md << "|-----------|------------------------------|-------------------------------------|----------------------------------|-----------------------------------------|\n";
+    f_summary << std::fixed << std::setprecision(6);
+    f_summary_md << std::fixed << std::setprecision(2);
     for (std::int32_t i = 1; i <= 64; ++i) {
-        std::string avg_prime = count_prime[i] ? std::to_string(time_prime_sum[i] / count_prime[i]) : "nan";
-        std::string avg_prime_NoTable = count_prime_NoTable[i] ? std::to_string(time_prime_sum_NoTable[i] / count_prime_NoTable[i]) : "nan";
-        std::string avg_composite = count_composite[i] ? std::to_string(time_composite_sum[i] / count_composite[i]) : "nan";
-        std::string avg_composite_NoTable = count_composite_NoTable[i] ? std::to_string(time_composite_sum_NoTable[i] / count_composite_NoTable[i]) : "nan";
-        f_summary << avg_prime << "," << avg_prime_NoTable << "," << avg_composite << "," << avg_composite_NoTable << "\n";
-        f_summary_md << "| " << i << " | " << avg_prime << " | " << avg_prime_NoTable << " | " << avg_composite << " | " << avg_composite_NoTable << " |\n";
+        auto print_result = [](std::ofstream& f, double val, std::int32_t count) -> std::ofstream& {  // Writes the average val / count to f, or "nan" when count is zero; returns f so callers can chain a separator.
+            if (count) {
+                f << (val / count);  // Uses the fixed/precision formatting state already set on the stream.
+            } else {
+                f << "nan";  // count is zero, so there is no average to report (and no division is attempted).
+            }
+            return f;
+        };
+        print_result(f_summary, time_prime_sum[i], count_prime[i]) << ",";
+        print_result(f_summary, time_prime_sum_NoTable[i], count_prime_NoTable[i]) << ",";
+        print_result(f_summary, time_composite_sum[i], count_composite[i]) << ",";
+        print_result(f_summary, time_composite_sum_NoTable[i], count_composite_NoTable[i]) << "\n";
+        f_summary_md << "| " << i << " | ";
+        print_result(f_summary_md, time_prime_sum[i], count_prime[i]) << " | ";
+        print_result(f_summary_md, time_prime_sum_NoTable[i], count_prime_NoTable[i]) << " | ";
+        print_result(f_summary_md, time_composite_sum[i], count_composite[i]) << " | ";
+        print_result(f_summary_md, time_composite_sum_NoTable[i], count_composite_NoTable[i]) << " |\n";
     }
     f_summary << std::flush;
     f_summary_md << std::flush;

From 28aae704e2010cf7bfb18f9c43ef002367c4a5e0 Mon Sep 17 00:00:00 2001
From: Rac <rac75116@gmail.com>
Date: Sun, 4 Jan 2026 01:29:35 +0900
Subject: [PATCH 2/2] Improve performance

---
 README.md                                     |  2 ++
 benchmarks/isprime_bench.cpp                  |  2 +-
 include/libcpprime/IsPrime.hpp                |  8 +++---
 include/libcpprime/IsPrimeNoTable.hpp         | 10 +++----
 include/libcpprime/internal/IsPrimeCommon.hpp | 26 +++++++++----------
 5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index cb51725..2815588 100644
--- a/README.md
+++ b/README.md
@@ -144,6 +144,8 @@ Benchmarks are executed on GitHub Actions.
 
 ## Releases
 
+-   2026/01/04 ver 1.3.2
+    - Improve performance
 -   2025/12/24 ver 1.3.1
     - Improve performance and reduce binary size for `cppr::IsPrime`
 -   2025/12/21 ver 1.3.0
diff --git a/benchmarks/isprime_bench.cpp b/benchmarks/isprime_bench.cpp
index cdc0675..bc48c8c 100644
--- a/benchmarks/isprime_bench.cpp
+++ b/benchmarks/isprime_bench.cpp
@@ -32,7 +32,7 @@ int main(int argc, char** argv) {
             weighted[count++] = 64 - i;
         }
     }
-    auto bench = [rng = Rng(), heavy](bool (*func)(std::uint64_t)) mutable {
+    auto bench = [rng = Rng(100), heavy](bool (*func)(std::uint64_t)) mutable {
         std::uint32_t k = weighted[rng.bounded(89440)];
         std::uint64_t n = (rng() >> k) | 1;
         int iters = (heavy ? 300 : 250);
diff --git a/include/libcpprime/IsPrime.hpp b/include/libcpprime/IsPrime.hpp
index 80360e3..5cbea0a 100644
--- a/include/libcpprime/IsPrime.hpp
+++ b/include/libcpprime/IsPrime.hpp
@@ -41,14 +41,14 @@ constexpr std::uint64_t FlagTable17[1024] = {
 #include "internal/IsPrimeTable17.txt"
 };
 // Bitset for odd numbers < 2^17 (2 is handled explicitly).
-CPPR_INTERNAL_CONSTEXPR bool IsPrime17(const std::uint64_t n) noexcept { return n == 2 || (n % 2 == 1 && (FlagTable17[n / 128] & (1ull << (n % 128 / 2)))); }
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime17(const std::uint64_t n) noexcept { return n == 2 || (n % 2 == 1 && (FlagTable17[n / 128] & (1ull << (n % 128 / 2)))); }
 
 constexpr std::uint16_t Bases64[16384] = {
 #include "internal/IsPrimeBases64.txt"
 };
 // Deterministic base selection via a multiplicative hash (fast table lookup).
-CPPR_INTERNAL_CONSTEXPR std::uint16_t GetBase(std::uint64_t x) noexcept { return Bases64[(0xad625b89u * static_cast<std::uint32_t>(x)) >> 18]; }
-CPPR_INTERNAL_CONSTEXPR bool IsPrime49(const std::uint64_t x) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE std::uint16_t GetBase(std::uint64_t x) noexcept { return Bases64[(0xad625b89u * static_cast<std::uint32_t>(x)) >> 18]; }
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime49(const std::uint64_t x) noexcept {
     const MontgomeryModint64Impl<false> mint(x);
     const std::int32_t S = CountrZero(x - 1);
     const std::uint64_t D = (x - 1) >> S;
@@ -91,7 +91,7 @@ CPPR_INTERNAL_CONSTEXPR bool IsPrime49(const std::uint64_t x) noexcept {
     return res1 && res2;
 }
 template <bool Strict>
-CPPR_INTERNAL_CONSTEXPR bool IsPrime64(const std::uint64_t x) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime64(const std::uint64_t x) noexcept {
     const MontgomeryModint64Impl<Strict> mint(x);
     const std::int32_t S = CountrZero(x - 1);
     const std::uint64_t D = (x - 1) >> S;
diff --git a/include/libcpprime/IsPrimeNoTable.hpp b/include/libcpprime/IsPrimeNoTable.hpp
index e11144b..85c26b3 100644
--- a/include/libcpprime/IsPrimeNoTable.hpp
+++ b/include/libcpprime/IsPrimeNoTable.hpp
@@ -23,9 +23,9 @@ constexpr std::uint32_t FlagTable10[32] = {
 #include "internal/IsPrimeTable10.txt"
 };
 // Bitset for small n < 1024.
-CPPR_INTERNAL_CONSTEXPR bool IsPrime10(const std::uint64_t n) noexcept { return (FlagTable10[n / 32] >> (n % 32)) & 1; }
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime10(const std::uint64_t n) noexcept { return (FlagTable10[n / 32] >> (n % 32)) & 1; }
 
-CPPR_INTERNAL_CONSTEXPR bool GCDFilter(const std::uint32_t n) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE bool GCDFilter(const std::uint32_t n) noexcept {
     auto GCD = [](std::uint32_t x, std::uint32_t y) -> std::uint32_t {
         // Binary GCD (Stein's algorithm). Assumes y != 0 when x != 0.
         if (x == 0) return 0;
@@ -56,7 +56,7 @@ CPPR_INTERNAL_CONSTEXPR bool GCDFilter(const std::uint32_t n) noexcept {
     return GCD((a * b) % n, n) == 1;
 }
 
-CPPR_INTERNAL_CONSTEXPR std::uint64_t GetLucasBase(const std::uint64_t x) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t GetLucasBase(const std::uint64_t x) noexcept {
     // Chooses a Lucas parameter D for the strong Lucas probable prime test.
     // Returns:
     // - 0: definitely composite (quick checks found a factor or perfect square)
@@ -113,7 +113,7 @@ CPPR_INTERNAL_CONSTEXPR std::uint64_t GetLucasBase(const std::uint64_t x) noexce
     return Z;
 }
 
-CPPR_INTERNAL_CONSTEXPR bool IsPrime64MillerRabin(const std::uint64_t x) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime64MillerRabin(const std::uint64_t x) noexcept {
     const MontgomeryModint64Impl<false> mint(x);
     const std::int32_t S = CountrZero(x - 1);
     const std::uint64_t D = (x - 1) >> S;
@@ -263,7 +263,7 @@ CPPR_INTERNAL_CONSTEXPR bool IsPrime64MillerRabin(const std::uint64_t x) noexcep
     }
 }
 
-CPPR_INTERNAL_CONSTEXPR bool IsPrime64BailliePSW(const std::uint64_t x) noexcept {
+CPPR_INTERNAL_CONSTEXPR_INLINE bool IsPrime64BailliePSW(const std::uint64_t x) noexcept {
     const MontgomeryModint64Impl<true> mint(x);
     const auto one = mint.one();
     const auto mone = mint.neg(one);
diff --git a/include/libcpprime/internal/IsPrimeCommon.hpp b/include/libcpprime/internal/IsPrimeCommon.hpp
index 80d16d0..38c13d8 100644
--- a/include/libcpprime/internal/IsPrimeCommon.hpp
+++ b/include/libcpprime/internal/IsPrimeCommon.hpp
@@ -40,7 +40,7 @@ template <bool Strict = false>
 class MontgomeryModint64Impl {
     std::uint64_t mod_ = 0, rs = 0, nr = 0, np = 0;
 
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t reduce(const std::uint64_t n) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t reduce(const std::uint64_t n) const noexcept {
         // Montgomery reduction of a 128-bit value with implicit low half `n`.
         std::uint64_t q = n * nr;
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
@@ -51,7 +51,7 @@ class MontgomeryModint64Impl {
             return mod_ - m;
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t reduce(const std::uint64_t a, const std::uint64_t b) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t reduce(const std::uint64_t a, const std::uint64_t b) const noexcept {
         // Montgomery reduction of the product a*b.
         auto tmp = Mulu128(a, b);
         std::uint64_t d = tmp.high;
@@ -81,13 +81,13 @@ class MontgomeryModint64Impl {
         for (std::uint32_t i = 0; i != 5; ++i) nr *= 2 - n * nr;
         np = reduce(rs);
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t build(std::uint32_t x) const noexcept { return reduce(x % mod_, rs); }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t build(std::uint64_t x) const noexcept { return reduce(x % mod_, rs); }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t raw(std::uint64_t x) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t build(std::uint32_t x) const noexcept { return reduce(x % mod_, rs); }
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t build(std::uint64_t x) const noexcept { return reduce(x % mod_, rs); }
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t raw(std::uint64_t x) const noexcept {
         Assume(x < mod_);
         return reduce(x, rs);
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t val(std::uint64_t x) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t val(std::uint64_t x) const noexcept {
         // Converts from Montgomery domain back to the standard residue.
         // Non-strict mode permits values in [0, 2*mod) for faster operations.
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
@@ -99,7 +99,7 @@ class MontgomeryModint64Impl {
             return tmp - mod_ * (tmp >= mod_);
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t one() const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t one() const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(np < mod_);
             return np;
@@ -108,7 +108,7 @@ class MontgomeryModint64Impl {
             return np;
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t neg(std::uint64_t x) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t neg(std::uint64_t x) const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_);
             return (mod_ - x) * (x != 0);
@@ -117,7 +117,7 @@ class MontgomeryModint64Impl {
             return (2 * mod_ - x) * (x != 0);
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t mul(std::uint64_t x, std::uint64_t y) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t mul(std::uint64_t x, std::uint64_t y) const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_ && y < mod_);
             return reduce(x, y);
@@ -126,7 +126,7 @@ class MontgomeryModint64Impl {
             return reduce(x, y);
         }
     }
-    CPPR_INTERNAL_CONSTEXPR bool same(std::uint64_t x, std::uint64_t y) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE bool same(std::uint64_t x, std::uint64_t y) const noexcept {
         // Equality check that tolerates the relaxed range in non-strict mode.
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_ && y < mod_);
@@ -137,7 +137,7 @@ class MontgomeryModint64Impl {
             return (tmp == 0) || (tmp == mod_) || (tmp == 0 - mod_);
         }
     }
-    CPPR_INTERNAL_CONSTEXPR bool is_zero(std::uint64_t x) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE bool is_zero(std::uint64_t x) const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_);
             return x == 0;
@@ -146,7 +146,7 @@ class MontgomeryModint64Impl {
             return x == 0 || x == mod_;
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t add(std::uint64_t x, std::uint64_t y) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t add(std::uint64_t x, std::uint64_t y) const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_ && y < mod_);
             return x + y - (x >= mod_ - y) * mod_;
@@ -155,7 +155,7 @@ class MontgomeryModint64Impl {
             return x + y - (x >= 2 * mod_ - y) * (2 * mod_);
         }
     }
-    CPPR_INTERNAL_CONSTEXPR std::uint64_t sub(std::uint64_t x, std::uint64_t y) const noexcept {
+    CPPR_INTERNAL_CONSTEXPR_INLINE std::uint64_t sub(std::uint64_t x, std::uint64_t y) const noexcept {
         if CPPR_INTERNAL_IF_CONSTEXPR (Strict) {
             Assume(x < mod_ && y < mod_);
             return x - y + (x < y) * mod_;