diff --git a/bun.lock b/bun.lock index 76ba3323..b9b77524 100644 --- a/bun.lock +++ b/bun.lock @@ -16,6 +16,7 @@ "@types/node": "^22", "@types/qrcode-terminal": "^0.12.2", "@types/semver": "^7.7.1", + "binpunch": "^1.0.0", "chalk": "^5.6.2", "esbuild": "^0.25.0", "fast-check": "^4.5.3", @@ -299,6 +300,8 @@ "binary-extensions": ["binary-extensions@2.3.0", "", {}, "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw=="], + "binpunch": ["binpunch@1.0.0", "", { "bin": { "binpunch": "dist/cli.js" } }, "sha512-ghxdoerLN3WN64kteDJuL4d9dy7gbvcqoADNRWBk6aQ5FrYH1EmPmREAdcdIdTNAA3uW3V38Env5OqH2lj+i+g=="], + "brace-expansion": ["brace-expansion@2.0.2", "", { "dependencies": { "balanced-match": "^1.0.0" } }, "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ=="], "braces": ["braces@3.0.3", "", { "dependencies": { "fill-range": "^7.1.1" } }, "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA=="], diff --git a/package.json b/package.json index 7a34e65a..ce5c3797 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,6 @@ "dev": "bun run src/bin.ts", "build": "bun run script/build.ts --single", "build:all": "bun run script/build.ts", - "hole-punch": "bun run script/hole-punch.ts", "bundle": "bun run script/bundle.ts", "typecheck": "tsc --noEmit", "lint": "bunx ultracite check", @@ -38,6 +37,7 @@ "@types/node": "^22", "@types/qrcode-terminal": "^0.12.2", "@types/semver": "^7.7.1", + "binpunch": "^1.0.0", "chalk": "^5.6.2", "esbuild": "^0.25.0", "fast-check": "^4.5.3", diff --git a/script/build.ts b/script/build.ts index 4df15105..5297db38 100644 --- a/script/build.ts +++ b/script/build.ts @@ -22,9 +22,9 @@ import { promisify } from "node:util"; import { gzip } from "node:zlib"; +import { processBinary } from "binpunch"; import { $ } from "bun"; import pkg from "../package.json"; -import { processBinary } from "./hole-punch.js"; const gzipAsync = promisify(gzip); diff --git a/script/hole-punch.ts b/script/hole-punch.ts deleted file mode 100644 index 9f7f2206..00000000 --- a/script/hole-punch.ts +++ /dev/null @@ -1,550 +0,0 @@ -#!/usr/bin/env bun -/** - * Post-compile binary hole-punch tool for Bun-compiled executables. - * - * "Punches holes" in the binary by zeroing unused ICU data entries inside - * the embedded ICU data blob. These zeroed regions compress to nearly nothing, - * reducing compressed download size by ~24%. - * - * How it works: - * 1. Scans the binary for the ICU data header (magic bytes 0xda27, type "CmnD") - * 2. Reads the Table of Contents (TOC) to enumerate all data entries - * 3. Zeros data for entries that are safe to remove (converters, CJK dictionaries, - * non-English locale data in subcategories) - * 4. Keeps all root-level entries, normalization files, break iterators, and - * English locale data intact (Bun accesses these at startup/shutdown) - * - * Safety: The TOC structure is left intact — only entry data bytes are zeroed. - * The binary remains valid and all CLI functionality works with clean exits. - * - * Usage: - * bun run script/hole-punch.ts # Modify in-place - * bun run script/hole-punch.ts dist-bin/sentry-* # Glob multiple binaries - * - * Expected savings (linux-x64): - * gzip: ~37 MB -> ~28 MB (24% reduction) - * zstd: ~35 MB -> ~26 MB (24% reduction) - * zstd-19: ~27 MB -> ~21 MB (23% reduction) - */ - -import { existsSync, readFileSync, statSync, writeFileSync } from "node:fs"; - -/** ICU data header magic value (little-endian uint16 at offset 2) */ -const ICU_MAGIC = 0x27_da; - -/** - * ICU data type identifier. - * "CmnD" = Common Data — the monolithic ICU data package format. - */ -const ICU_TYPE_CMND = "CmnD"; - -/** Subcategories where non-essential locale data lives */ -const LOCALE_SUBCATEGORIES = new Set([ - "coll/", - "zone/", - "curr/", - "lang/", - "unit/", - "region/", - "rbnf/", - "translit/", -]); - -/** - * Prefixes for entries within subcategories that must be preserved. - * These contain core data needed by the ICU runtime and Bun. - */ -const KEEP_PREFIXES = [ - "root", - "en", - "res_index", - "pool", - "supplementalData", - "ucadata", - "tzdbNames", -]; - -/** Result of scanning a binary for ICU data */ -type IcuScanResult = { - /** Byte offset where the ICU data blob starts in the binary */ - blobOffset: number; - /** Size of the ICU data header in bytes */ - headerSize: number; - /** Total number of TOC entries */ - entryCount: number; - /** Parsed TOC entries with names and data boundaries */ - entries: IcuEntry[]; - /** ICU version prefix (e.g., "icudt75l") */ - prefix: string; -}; - -/** A single entry in the ICU data TOC */ -type IcuEntry = { - /** Entry name relative to ICU prefix (e.g., "coll/de.res") */ - name: string; - /** Absolute offset of entry data within the binary */ - dataOffset: number; - /** Size of entry data in bytes */ - dataSize: number; - /** Whether this entry should be zeroed */ - shouldRemove: boolean; -}; - -/** Statistics from a hole-punch operation */ -type HolePunchStats = { - totalEntries: number; - removedEntries: number; - keptEntries: number; - bytesZeroed: number; - bytesKept: number; -}; - -/** - * Scan a binary buffer for the ICU data blob header. - * - * The ICU common data format starts with: - * - uint16 headerSize (offset 0) - * - uint16 magic 0xda27 (offset 2) - * - UDataInfo structure starting at offset 4: - * - uint16 size (offset 4) - * - uint16 reserved (offset 6) - * - uint8 isBigEndian (offset 8) - * - uint8 charsetFamily (offset 9) - * - uint8 sizeofUChar (offset 10) - * - uint8 reserved (offset 11) - * - char[4] dataFormat (offset 12, e.g., "CmnD") - * - * @returns Byte offset of the blob, or -1 if not found - */ -function findIcuBlob(buf: Buffer): number { - // Scan for the ICU magic bytes, stepping by 4 (ICU blob is at least 4-byte aligned) - for (let i = 0; i < buf.length - 16; i += 4) { - const magic = buf.readUInt16LE(i + 2); - if (magic !== ICU_MAGIC) { - continue; - } - - // Verify the dataFormat field is "CmnD" (at offset +12 in the header) - const dataFormat = buf.toString("ascii", i + 12, i + 16); - if (dataFormat !== ICU_TYPE_CMND) { - continue; - } - - const headerSize = buf.readUInt16LE(i); - // Header size should be reasonable (typically 64-256 bytes, includes copyright) - if (headerSize < 16 || headerSize > 512) { - continue; - } - - return i; - } - - return -1; -} - -/** - * Read raw TOC entries from the ICU data blob. - * - * Each TOC entry is 8 bytes: uint32 nameOffset + uint32 dataOffset, - * both relative to the TOC start. - */ -function readRawTocEntries( - buf: Buffer, - tocStart: number, - entryCount: number -): { nameOffset: number; dataOffset: number }[] { - const tocEntriesStart = tocStart + 4; - const rawEntries: { nameOffset: number; dataOffset: number }[] = []; - - for (let i = 0; i < entryCount; i += 1) { - const offset = tocEntriesStart + i * 8; - rawEntries.push({ - nameOffset: buf.readUInt32LE(offset), - dataOffset: buf.readUInt32LE(offset + 4), - }); - } - - return rawEntries; -} - -/** - * Read a null-terminated ASCII string from the buffer. - */ -function readNullTerminatedString(buf: Buffer, start: number): string { - let end = start; - while (end < buf.length && buf[end] !== 0) { - end += 1; - } - return buf.toString("ascii", start, end); -} - -/** - * Estimate the data size of the last TOC entry. - * - * The last entry has no successor to measure against, so we estimate - * using twice the average entry size (capped at 64KB). - */ -function estimateLastEntrySize(entries: IcuEntry[]): number { - if (entries.length < 2) { - return 4096; - } - - const firstData = entries[0].dataOffset; - const last = entries.at(-1); - if (!last) { - return 4096; - } - const avgSize = (last.dataOffset - firstData) / (entries.length - 1); - return Math.min(Math.ceil(avgSize * 2), 65_536); -} - -/** - * Parse the ICU data blob's Table of Contents. - * - * After the header, the TOC structure is: - * - uint32 entryCount (at blobOffset + headerSize) - * - For each entry (8 bytes each): - * - uint32 nameOffset (relative to TOC start) - * - uint32 dataOffset (relative to TOC start) - * - Names area (null-terminated strings) - * - Data area (entry data, each aligned to 16 bytes) - */ -function parseIcuToc(buf: Buffer, blobOffset: number): IcuScanResult { - const headerSize = buf.readUInt16LE(blobOffset); - const tocStart = blobOffset + headerSize; - const entryCount = buf.readUInt32LE(tocStart); - - if (entryCount < 100 || entryCount > 10_000) { - throw new Error( - `Unexpected ICU entry count: ${entryCount}. Binary may be corrupted.` - ); - } - - const rawEntries = readRawTocEntries(buf, tocStart, entryCount); - - // Read names and compute data sizes - const entries: IcuEntry[] = []; - let prefix = ""; - - for (let i = 0; i < rawEntries.length; i += 1) { - const raw = rawEntries[i]; - const fullName = readNullTerminatedString(buf, tocStart + raw.nameOffset); - - // Extract ICU prefix from first entry (e.g., "icudt75l/") - if (i === 0) { - const slashIdx = fullName.indexOf("/"); - if (slashIdx !== -1) { - prefix = fullName.substring(0, slashIdx); - } - } - - // Strip prefix (e.g., "icudt75l/coll/de.res" -> "coll/de.res") - const name = prefix ? fullName.substring(prefix.length + 1) : fullName; - - // Data size = distance to next entry's data (or estimated for last entry) - const dataAbsOffset = tocStart + raw.dataOffset; - const dataSize = - i < rawEntries.length - 1 - ? tocStart + rawEntries[i + 1].dataOffset - dataAbsOffset - : 0; // Placeholder for last entry, fixed below - - entries.push({ - name, - dataOffset: dataAbsOffset, - dataSize, - shouldRemove: false, - }); - } - - // Fix last entry size estimate - const lastEntry = entries.at(-1); - if (lastEntry) { - lastEntry.dataSize = estimateLastEntrySize(entries); - } - - return { blobOffset, headerSize, entryCount, entries, prefix }; -} - -/** - * Determine whether an ICU entry should be zeroed. - * - * Safe to remove: - * - `.cnv` files: legacy charset converters (never used in JS/Bun) - * - `.dict` files in `brkitr/`: CJK/Burmese/Khmer break dictionaries - * - Non-essential locale data in subcategories (coll/, zone/, curr/, etc.) - * - * Must keep: - * - All root-level `.res` files (Bun accesses these during shutdown) - * - All `.nrm`, `.icu`, `.cfu`, `.brk`, `.spp` files - * - `res_index.res`, `pool.res` in every subcategory - * - Root and English entries in subcategories - */ -function shouldRemoveEntry(name: string): boolean { - // Legacy charset converters — never used in JS - if (name.endsWith(".cnv")) { - return true; - } - - // CJK/Burmese/Khmer break dictionaries — large, not needed for CLI - if (name.includes("brkitr/") && name.endsWith(".dict")) { - return true; - } - - // Check subcategory locale data - for (const subcat of LOCALE_SUBCATEGORIES) { - if (!name.startsWith(subcat)) { - continue; - } - - const filename = name.substring(subcat.length); - - // Keep essential entries (root, English, indexes, pools, supplemental data) - const shouldKeep = KEEP_PREFIXES.some( - (p) => - filename === p || - filename.startsWith(`${p}.`) || - filename.startsWith(`${p}_`) - ); - - if (!shouldKeep) { - return true; - } - } - - return false; -} - -/** - * Punch holes in a binary buffer by zeroing removable ICU entries in-place. - * - * Zeros data bytes for removable ICU entries while keeping the TOC intact. - * This makes the zeroed regions compress to nearly nothing. - */ -function holePunch(buf: Buffer, scan: IcuScanResult): HolePunchStats { - let removedEntries = 0; - let keptEntries = 0; - let bytesZeroed = 0; - let bytesKept = 0; - - const lastIndex = scan.entries.length - 1; - - for (let i = 0; i < scan.entries.length; i += 1) { - const entry = scan.entries[i]; - entry.shouldRemove = shouldRemoveEntry(entry.name); - - // Skip the last entry: its size is estimated (no successor to measure - // against) and zeroing it could overwrite bytes outside the ICU blob. - // One skipped entry has negligible impact on compression savings. - if (i === lastIndex) { - keptEntries += 1; - bytesKept += entry.dataSize; - continue; - } - - // Clamp data size to not exceed buffer bounds - const safeSize = Math.min(entry.dataSize, buf.length - entry.dataOffset); - if (safeSize <= 0) { - keptEntries += 1; - continue; - } - - if (entry.shouldRemove) { - buf.fill(0, entry.dataOffset, entry.dataOffset + safeSize); - removedEntries += 1; - bytesZeroed += safeSize; - } else { - keptEntries += 1; - bytesKept += safeSize; - } - } - - return { - totalEntries: scan.entryCount, - removedEntries, - keptEntries, - bytesZeroed, - bytesKept, - }; -} - -/** - * Process a single binary file: find ICU data, zero unused entries, write back. - * - * Returns null (rather than throwing) when the binary has no ICU data or - * when the ICU blob has an unexpected layout, so callers like the build - * script can skip hole-punch gracefully instead of crashing. - * - * @returns Hole-punch statistics, or null if no ICU data was found/parseable - */ -function processBinary(filePath: string): HolePunchStats | null { - const buf = readFileSync(filePath); - - const blobOffset = findIcuBlob(buf); - if (blobOffset === -1) { - return null; - } - - let scan: IcuScanResult; - try { - scan = parseIcuToc(buf, blobOffset); - } catch { - // ICU blob matched the magic bytes but has an unexpected layout - // (e.g., entry count out of range). Skip instead of crashing. - return null; - } - - const stats = holePunch(buf, scan); - writeFileSync(filePath, buf); - return stats; -} - -/** Format bytes as a human-readable string */ -function formatSize(bytes: number): string { - if (bytes >= 1024 * 1024) { - return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; - } - if (bytes >= 1024) { - return `${(bytes / 1024).toFixed(1)} KB`; - } - return `${bytes} B`; -} - -// --- Exports for testing --- - -export { - findIcuBlob, - parseIcuToc, - shouldRemoveEntry, - holePunch, - processBinary, - formatSize, - estimateLastEntrySize, - runCli, -}; -export type { IcuScanResult, IcuEntry, HolePunchStats, CliFileResult }; - -// --- CLI Entry Point --- - -/** Result from a single file processed by the CLI */ -type CliFileResult = { - filePath: string; - status: "no_icu" | "no_removable" | "success"; - stats?: HolePunchStats; - originalSize?: number; -}; - -/** - * Run the hole-punch CLI logic. - * - * Extracted from main() so it can be tested in-process without mocking - * process.exit or console output. - * - * @returns Error message string if validation fails, or array of results - */ -function runCli( - args: string[] -): { error: string } | { results: CliFileResult[] } { - const filePaths = args.filter((a) => !a.startsWith("-")); - - if (filePaths.length === 0) { - return { - error: - "Usage: bun run script/hole-punch.ts [--verbose] ...", - }; - } - - // Validate all files exist before processing - for (const filePath of filePaths) { - if (!existsSync(filePath)) { - return { error: `Error: File not found: ${filePath}` }; - } - const stat = statSync(filePath); - if (!stat.isFile()) { - return { error: `Error: Not a file: ${filePath}` }; - } - } - - const results: CliFileResult[] = []; - - for (const filePath of filePaths) { - const originalSize = statSync(filePath).size; - const stats = processBinary(filePath); - - if (!stats) { - results.push({ filePath, status: "no_icu" }); - continue; - } - - if (stats.removedEntries === 0) { - results.push({ filePath, status: "no_removable", stats, originalSize }); - continue; - } - - results.push({ filePath, status: "success", stats, originalSize }); - } - - return { results }; -} - -function main(): void { - const cliArgs = process.argv.slice(2); - const isVerbose = cliArgs.includes("--verbose") || cliArgs.includes("-v"); - const result = runCli(cliArgs); - - if ("error" in result) { - console.error(result.error); - if (result.error.startsWith("Usage:")) { - console.error(""); - console.error( - "Reduces compressed binary size by ~24% by zeroing unused ICU data." - ); - console.error("Modifies binaries in-place."); - } - process.exit(1); - } - - for (const fileResult of result.results) { - if (fileResult.status === "no_icu") { - console.error( - ` Warning: No ICU data found in ${fileResult.filePath}, skipping` - ); - continue; - } - - if (fileResult.status === "no_removable") { - console.log(` ${fileResult.filePath}: no removable entries found`); - continue; - } - - const { stats, originalSize, filePath } = fileResult; - if (!stats) { - continue; - } - - const pct = ( - (stats.bytesZeroed / (stats.bytesZeroed + stats.bytesKept)) * - 100 - ).toFixed(1); - - console.log( - ` ${filePath}: zeroed ${stats.removedEntries}/${stats.totalEntries} ICU entries (${formatSize(stats.bytesZeroed)}, ${pct}% of ICU data)` - ); - - if (isVerbose && originalSize !== undefined) { - console.log(` Raw size: ${formatSize(originalSize)} (unchanged)`); - console.log(` ICU entries kept: ${stats.keptEntries}`); - console.log(` ICU data kept: ${formatSize(stats.bytesKept)}`); - console.log(` ICU data zeroed: ${formatSize(stats.bytesZeroed)}`); - } - } -} - -// Only run CLI when executed directly (not imported for testing) -const isMainModule = - typeof Bun !== "undefined" && "main" in Bun - ? import.meta.path === (Bun as Record).main - : process.argv[1]?.endsWith("hole-punch.ts"); - -if (isMainModule) { - main(); -} diff --git a/test/lib/hole-punch.test.ts b/test/lib/hole-punch.test.ts deleted file mode 100644 index b756371d..00000000 --- a/test/lib/hole-punch.test.ts +++ /dev/null @@ -1,628 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { mkdtempSync, writeFileSync } from "node:fs"; -import { tmpdir } from "node:os"; -import { join } from "node:path"; -import type { IcuEntry } from "../../script/hole-punch.js"; -import { - estimateLastEntrySize, - findIcuBlob, - formatSize, - holePunch, - parseIcuToc, - processBinary, - runCli, - shouldRemoveEntry, -} from "../../script/hole-punch.js"; - -/** - * Build a synthetic ICU data blob for testing. - * - * Creates a minimal valid ICU common data package with the given entry names. - * Each entry gets 64 bytes of non-zero data (0xff fill) so we can verify - * that zeroing actually happened. - * - * @param prefix ICU version prefix (e.g., "icudt75l") - * @param entryNames Entry names without prefix (e.g., ["root.res", "coll/de.res"]) - * @param prePadding Bytes of padding before the ICU blob (simulates ELF sections) - * @returns Buffer containing the synthetic binary - */ -function buildSyntheticBlob( - prefix: string, - entryNames: string[], - prePadding = 256 -): Buffer { - const entryDataSize = 64; // Each entry gets 64 bytes of data - - // Full entry names include the prefix - const fullNames = entryNames.map((n) => `${prefix}/${n}`); - - // Calculate sizes: - // Header: 32 bytes (padded, includes UDataInfo + some copyright text) - const headerSize = 32; - - // TOC: 4 bytes (count) + 8 bytes per entry (nameOffset + dataOffset) - const tocHeaderSize = 4 + entryNames.length * 8; - - // Names area: all names null-terminated, then padded to 16 bytes - let namesSize = 0; - for (const name of fullNames) { - namesSize += name.length + 1; // +1 for null terminator - } - // Pad names to 16-byte alignment - const namesPadded = Math.ceil(namesSize / 16) * 16; - - // Data area: each entry gets entryDataSize bytes, aligned to 16 - const dataAreaSize = entryNames.length * entryDataSize; - - // Total blob size - const totalSize = - prePadding + headerSize + tocHeaderSize + namesPadded + dataAreaSize; - const buf = Buffer.alloc(totalSize); - - // Fill prePadding with random-ish data (simulates ELF content) - for (let i = 0; i < prePadding; i += 1) { - buf[i] = (i * 7 + 3) % 256; - } - - const blobStart = prePadding; - - // Write header - buf.writeUInt16LE(headerSize, blobStart); // headerSize - buf.writeUInt16LE(0x27_da, blobStart + 2); // magic - buf.writeUInt16LE(20, blobStart + 4); // UDataInfo.size - buf.writeUInt16LE(0, blobStart + 6); // UDataInfo.reserved - buf[blobStart + 8] = 0; // isBigEndian - buf[blobStart + 9] = 0; // charsetFamily - buf[blobStart + 10] = 2; // sizeofUChar - buf[blobStart + 11] = 0; // reserved - buf.write("CmnD", blobStart + 12, 4, "ascii"); // dataFormat - - // Write TOC - const tocStart = blobStart + headerSize; - buf.writeUInt32LE(entryNames.length, tocStart); // entryCount - - // Compute offsets relative to tocStart - const namesAreaOffset = tocHeaderSize; - const dataAreaOffset = tocHeaderSize + namesPadded; - - let currentNameOffset = namesAreaOffset; - - for (let i = 0; i < entryNames.length; i += 1) { - const tocEntryOffset = tocStart + 4 + i * 8; - - // Write name offset (relative to tocStart) - buf.writeUInt32LE(currentNameOffset, tocEntryOffset); - - // Write data offset (relative to tocStart) - const entryDataOffset = dataAreaOffset + i * entryDataSize; - buf.writeUInt32LE(entryDataOffset, tocEntryOffset + 4); - - // Write the name string - const nameAbsOffset = tocStart + currentNameOffset; - buf.write(fullNames[i], nameAbsOffset, "ascii"); - buf[nameAbsOffset + fullNames[i].length] = 0; // null terminator - - currentNameOffset += fullNames[i].length + 1; - - // Fill entry data with non-zero bytes so we can detect zeroing - const dataAbsOffset = tocStart + entryDataOffset; - buf.fill(0xff, dataAbsOffset, dataAbsOffset + entryDataSize); - } - - return buf; -} - -/** - * Check whether a data region is all zeros. - */ -function isZeroed(buf: Buffer, offset: number, size: number): boolean { - for (let i = 0; i < size; i += 1) { - if (buf[offset + i] !== 0) { - return false; - } - } - return true; -} - -/** - * Check whether a data region is all 0xff (non-zero fill). - */ -function isNonZero(buf: Buffer, offset: number, size: number): boolean { - for (let i = 0; i < size; i += 1) { - if (buf[offset + i] !== 0xff) { - return false; - } - } - return true; -} - -describe("findIcuBlob", () => { - test("finds ICU blob at the correct offset", () => { - const buf = buildSyntheticBlob("icudt75l", ["root.res"], 256); - const offset = findIcuBlob(buf); - expect(offset).toBe(256); - }); - - test("finds ICU blob with different padding", () => { - const buf = buildSyntheticBlob("icudt75l", ["root.res"], 1024); - const offset = findIcuBlob(buf); - expect(offset).toBe(1024); - }); - - test("returns -1 for buffer without ICU data", () => { - const buf = Buffer.alloc(4096); - expect(findIcuBlob(buf)).toBe(-1); - }); - - test("returns -1 for buffer too small", () => { - const buf = Buffer.alloc(8); - expect(findIcuBlob(buf)).toBe(-1); - }); - - test("handles different ICU version prefixes", () => { - const buf = buildSyntheticBlob("icudt80l", ["root.res"], 256); - const offset = findIcuBlob(buf); - expect(offset).toBe(256); - }); -}); - -describe("parseIcuToc", () => { - test("parses entry count correctly", () => { - const names = Array.from({ length: 200 }, (_, i) => `entry${i}.res`); - const buf = buildSyntheticBlob("icudt75l", names); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - - expect(scan.entryCount).toBe(200); - expect(scan.entries).toHaveLength(200); - }); - - test("extracts ICU prefix from first entry", () => { - // Need at least 100 entries to pass validation - const names = [ - "root.res", - "en.res", - ...Array.from({ length: 100 }, (_, i) => `extra${i}.res`), - ]; - const buf = buildSyntheticBlob("icudt75l", names); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - - expect(scan.prefix).toBe("icudt75l"); - }); - - test("strips prefix from entry names", () => { - const names = [ - "root.res", - "coll/de.res", - ...Array.from({ length: 100 }, (_, i) => `extra${i}.res`), - ]; - const buf = buildSyntheticBlob("icudt75l", names); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - - expect(scan.entries[0].name).toBe("root.res"); - expect(scan.entries[1].name).toBe("coll/de.res"); - }); - - test("computes data sizes from entry offsets", () => { - const names = [ - "root.res", - "en.res", - "coll/de.res", - ...Array.from({ length: 100 }, (_, i) => `extra${i}.res`), - ]; - const buf = buildSyntheticBlob("icudt75l", names); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - - // First two entries should have dataSize = 64 (the entryDataSize) - expect(scan.entries[0].dataSize).toBe(64); - expect(scan.entries[1].dataSize).toBe(64); - // Last entry size is estimated, should be > 0 - const lastEntry = scan.entries.at(-1); - expect(lastEntry).toBeDefined(); - expect(lastEntry!.dataSize).toBeGreaterThan(0); - }); -}); - -describe("shouldRemoveEntry", () => { - test("removes .cnv files", () => { - expect(shouldRemoveEntry("ibm-1252_P100-2000.cnv")).toBe(true); - expect(shouldRemoveEntry("iso-8859-1.cnv")).toBe(true); - }); - - test("removes brkitr .dict files", () => { - expect(shouldRemoveEntry("brkitr/cjdict.dict")).toBe(true); - expect(shouldRemoveEntry("brkitr/burmesedict.dict")).toBe(true); - expect(shouldRemoveEntry("brkitr/khmerdict.dict")).toBe(true); - }); - - test("keeps brkitr .brk files", () => { - expect(shouldRemoveEntry("brkitr/word.brk")).toBe(false); - expect(shouldRemoveEntry("brkitr/line.brk")).toBe(false); - }); - - test("removes non-English locale data in subcategories", () => { - expect(shouldRemoveEntry("coll/de.res")).toBe(true); - expect(shouldRemoveEntry("coll/fr.res")).toBe(true); - expect(shouldRemoveEntry("coll/ja.res")).toBe(true); - expect(shouldRemoveEntry("zone/de.res")).toBe(true); - expect(shouldRemoveEntry("curr/zh.res")).toBe(true); - expect(shouldRemoveEntry("lang/ko.res")).toBe(true); - expect(shouldRemoveEntry("unit/ar.res")).toBe(true); - expect(shouldRemoveEntry("region/pt.res")).toBe(true); - expect(shouldRemoveEntry("rbnf/ru.res")).toBe(true); - expect(shouldRemoveEntry("translit/el.res")).toBe(true); - }); - - test("keeps root entries in subcategories", () => { - expect(shouldRemoveEntry("coll/root.res")).toBe(false); - expect(shouldRemoveEntry("zone/root.res")).toBe(false); - expect(shouldRemoveEntry("curr/root.res")).toBe(false); - }); - - test("keeps English entries in subcategories", () => { - expect(shouldRemoveEntry("coll/en.res")).toBe(false); - expect(shouldRemoveEntry("coll/en_US.res")).toBe(false); - expect(shouldRemoveEntry("zone/en.res")).toBe(false); - expect(shouldRemoveEntry("zone/en_GB.res")).toBe(false); - }); - - test("keeps res_index and pool files in subcategories", () => { - expect(shouldRemoveEntry("coll/res_index.res")).toBe(false); - expect(shouldRemoveEntry("coll/pool.res")).toBe(false); - expect(shouldRemoveEntry("zone/res_index.res")).toBe(false); - }); - - test("keeps supplemental data", () => { - expect(shouldRemoveEntry("coll/ucadata.res")).toBe(false); - }); - - test("keeps root-level .res files", () => { - expect(shouldRemoveEntry("root.res")).toBe(false); - expect(shouldRemoveEntry("en.res")).toBe(false); - expect(shouldRemoveEntry("de.res")).toBe(false); - expect(shouldRemoveEntry("ja.res")).toBe(false); - }); - - test("keeps .nrm, .icu, .cfu files", () => { - expect(shouldRemoveEntry("nfc.nrm")).toBe(false); - expect(shouldRemoveEntry("uprops.icu")).toBe(false); - expect(shouldRemoveEntry("confusables.cfu")).toBe(false); - }); -}); - -describe("holePunch (apply)", () => { - test("zeros data for removable entries", () => { - const entryNames = [ - // Should be kept (200 entries to pass the >100 validation) - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - // Should be removed - "ibm-1252.cnv", - "coll/de.res", - "coll/fr.res", - "zone/ja.res", - "brkitr/cjdict.dict", - // Should be kept - "coll/root.res", - "coll/en.res", - "coll/res_index.res", - ]; - const buf = buildSyntheticBlob("icudt75l", entryNames); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - const stats = holePunch(buf, scan); - - // Verify counts - expect(stats.totalEntries).toBe(entryNames.length); - expect(stats.removedEntries).toBe(5); - expect(stats.keptEntries).toBe(entryNames.length - 5); - - // Verify removed entries are actually zeroed - for (const entry of scan.entries) { - if (entry.shouldRemove) { - expect(isZeroed(buf, entry.dataOffset, entry.dataSize)).toBe(true); - } - } - - // Verify kept entries still have their data - for (const entry of scan.entries) { - if (!entry.shouldRemove && entry.dataSize > 0) { - // Non-last entries should still be 0xff - const idx = scan.entries.indexOf(entry); - if (idx < scan.entries.length - 1) { - expect(isNonZero(buf, entry.dataOffset, entry.dataSize)).toBe(true); - } - } - } - }); - - test("returns zero stats when nothing is removable", () => { - const entryNames = Array.from({ length: 150 }, (_, i) => `entry${i}.res`); - const buf = buildSyntheticBlob("icudt75l", entryNames); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - const stats = holePunch(buf, scan); - - expect(stats.removedEntries).toBe(0); - expect(stats.bytesZeroed).toBe(0); - expect(stats.keptEntries).toBe(150); - }); - - test("preserves TOC structure after hole-punch", () => { - const entryNames = [ - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - "ibm-1252.cnv", - "coll/de.res", - ]; - const buf = buildSyntheticBlob("icudt75l", entryNames); - const offset = findIcuBlob(buf); - - // Parse before hole-punch - const scanBefore = parseIcuToc(buf, offset); - const namesBefore = scanBefore.entries.map((e) => e.name); - - // Apply hole-punch - holePunch(buf, scanBefore); - - // Parse again — TOC should be identical - const scanAfter = parseIcuToc(buf, offset); - const namesAfter = scanAfter.entries.map((e) => e.name); - - expect(namesAfter).toEqual(namesBefore); - expect(scanAfter.entryCount).toBe(scanBefore.entryCount); - }); - - test("handles entries with dataOffset past buffer bounds (safeSize <= 0)", () => { - const entryNames = Array.from({ length: 150 }, (_, i) => `entry${i}.res`); - const buf = buildSyntheticBlob("icudt75l", entryNames); - const offset = findIcuBlob(buf); - const scan = parseIcuToc(buf, offset); - - // Force last entry's dataOffset past the buffer to trigger safeSize <= 0 - const lastEntry = scan.entries.at(-1)!; - lastEntry.dataOffset = buf.length + 100; - lastEntry.dataSize = 64; - lastEntry.shouldRemove = false; - - const stats = holePunch(buf, scan); - // The out-of-bounds entry should be counted as "kept" (skipped) - expect(stats.keptEntries).toBe(150); - }); -}); - -describe("estimateLastEntrySize", () => { - test("returns 4096 for fewer than 2 entries", () => { - const singleEntry: IcuEntry[] = [ - { name: "root.res", dataOffset: 1000, dataSize: 0, shouldRemove: false }, - ]; - expect(estimateLastEntrySize(singleEntry)).toBe(4096); - }); - - test("returns 4096 for empty array", () => { - expect(estimateLastEntrySize([])).toBe(4096); - }); - - test("estimates based on average entry size for multiple entries", () => { - const entries: IcuEntry[] = [ - { name: "a.res", dataOffset: 1000, dataSize: 64, shouldRemove: false }, - { name: "b.res", dataOffset: 1064, dataSize: 64, shouldRemove: false }, - { name: "c.res", dataOffset: 1128, dataSize: 0, shouldRemove: false }, - ]; - // Average size = (1128 - 1000) / 2 = 64, estimated = min(64*2, 65536) = 128 - expect(estimateLastEntrySize(entries)).toBe(128); - }); -}); - -describe("parseIcuToc (error paths)", () => { - test("throws when entry count is too low (< 100)", () => { - // Build blob with only 50 entries — below the 100 minimum threshold - const buf = buildSyntheticBlob("icudt75l", ["root.res"], 256); - const offset = findIcuBlob(buf); - // The blob has 1 entry but the validation requires >= 100 - expect(() => parseIcuToc(buf, offset)).toThrow( - /Unexpected ICU entry count/ - ); - }); - - test("throws when entry count is too high (> 10000)", () => { - // Create a minimal blob and manually set entry count to an absurd value - const buf = buildSyntheticBlob( - "icudt75l", - Array.from({ length: 200 }, (_, i) => `e${i}.res`), - 256 - ); - const offset = findIcuBlob(buf); - const headerSize = buf.readUInt16LE(offset); - const tocStart = offset + headerSize; - // Overwrite entryCount with 99999 - buf.writeUInt32LE(99_999, tocStart); - - expect(() => parseIcuToc(buf, offset)).toThrow( - /Unexpected ICU entry count/ - ); - }); -}); - -describe("formatSize", () => { - test("formats megabytes", () => { - expect(formatSize(1024 * 1024)).toBe("1.0 MB"); - expect(formatSize(5.5 * 1024 * 1024)).toBe("5.5 MB"); - expect(formatSize(29.3 * 1024 * 1024)).toBe("29.3 MB"); - }); - - test("formats kilobytes", () => { - expect(formatSize(1024)).toBe("1.0 KB"); - expect(formatSize(512 * 1024)).toBe("512.0 KB"); - expect(formatSize(2048)).toBe("2.0 KB"); - }); - - test("formats bytes", () => { - expect(formatSize(0)).toBe("0 B"); - expect(formatSize(1)).toBe("1 B"); - expect(formatSize(1023)).toBe("1023 B"); - }); -}); - -describe("processBinary", () => { - test("processes a file with ICU data and returns stats", () => { - const entryNames = [ - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - "ibm-1252.cnv", - "coll/de.res", - "coll/root.res", // Kept entry at end (last entry is never zeroed) - ]; - const buf = buildSyntheticBlob("icudt75l", entryNames); - - const dir = mkdtempSync(join(tmpdir(), "hole-punch-test-")); - const filePath = join(dir, "test-binary"); - writeFileSync(filePath, buf); - - const stats = processBinary(filePath); - expect(stats).not.toBeNull(); - expect(stats!.totalEntries).toBe(entryNames.length); - expect(stats!.removedEntries).toBe(2); // .cnv + coll/de.res - expect(stats!.bytesZeroed).toBeGreaterThan(0); - }); - - test("returns null for a file without ICU data", () => { - const buf = Buffer.alloc(4096); - const dir = mkdtempSync(join(tmpdir(), "hole-punch-test-")); - const filePath = join(dir, "no-icu-binary"); - writeFileSync(filePath, buf); - - const stats = processBinary(filePath); - expect(stats).toBeNull(); - }); -}); - -describe("runCli", () => { - test("returns error when no file arguments given", () => { - const result = runCli([]); - expect("error" in result).toBe(true); - if ("error" in result) { - expect(result.error).toContain("Usage:"); - } - }); - - test("returns error when only flags given (no files)", () => { - const result = runCli(["--verbose"]); - expect("error" in result).toBe(true); - if ("error" in result) { - expect(result.error).toContain("Usage:"); - } - }); - - test("returns error for non-existent file", () => { - const result = runCli(["/tmp/nonexistent-binary-xyz-12345"]); - expect("error" in result).toBe(true); - if ("error" in result) { - expect(result.error).toContain("File not found"); - } - }); - - test("returns error for a directory (not a file)", () => { - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const result = runCli([dir]); - expect("error" in result).toBe(true); - if ("error" in result) { - expect(result.error).toContain("Not a file"); - } - }); - - test("returns no_icu status for file without ICU data", () => { - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const filePath = join(dir, "empty-binary"); - writeFileSync(filePath, Buffer.alloc(4096)); - - const result = runCli([filePath]); - expect("results" in result).toBe(true); - if ("results" in result) { - expect(result.results).toHaveLength(1); - expect(result.results[0].status).toBe("no_icu"); - } - }); - - test("returns success status with stats for valid binary", () => { - const entryNames = [ - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - "ibm-1252.cnv", - "coll/de.res", - "coll/root.res", // Kept entry at end (last entry is never zeroed) - ]; - const buf = buildSyntheticBlob("icudt75l", entryNames); - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const filePath = join(dir, "test-binary"); - writeFileSync(filePath, buf); - - const result = runCli([filePath]); - expect("results" in result).toBe(true); - if ("results" in result) { - expect(result.results).toHaveLength(1); - expect(result.results[0].status).toBe("success"); - expect(result.results[0].stats).toBeDefined(); - expect(result.results[0].stats!.removedEntries).toBe(2); - expect(result.results[0].originalSize).toBeGreaterThan(0); - } - }); - - test("returns no_removable status when all entries are kept", () => { - // Build a blob with only root-level .res entries (none removable) - const entryNames = Array.from({ length: 150 }, (_, i) => `entry${i}.res`); - const buf = buildSyntheticBlob("icudt75l", entryNames); - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const filePath = join(dir, "test-binary"); - writeFileSync(filePath, buf); - - const result = runCli([filePath]); - expect("results" in result).toBe(true); - if ("results" in result) { - expect(result.results).toHaveLength(1); - expect(result.results[0].status).toBe("no_removable"); - } - }); - - test("processes multiple files", () => { - const entryNames = [ - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - "ibm-1252.cnv", - "root.res", // Kept entry at end (last entry is never zeroed) - ]; - const buf1 = buildSyntheticBlob("icudt75l", entryNames); - const buf2 = Buffer.alloc(4096); // no ICU data - - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const filePath1 = join(dir, "binary1"); - const filePath2 = join(dir, "binary2"); - writeFileSync(filePath1, buf1); - writeFileSync(filePath2, buf2); - - const result = runCli([filePath1, filePath2]); - expect("results" in result).toBe(true); - if ("results" in result) { - expect(result.results).toHaveLength(2); - expect(result.results[0].status).toBe("success"); - expect(result.results[1].status).toBe("no_icu"); - } - }); - - test("filters out flag arguments from file paths", () => { - const entryNames = [ - ...Array.from({ length: 150 }, (_, i) => `entry${i}.res`), - "ibm-1252.cnv", - "root.res", // Kept entry at end (last entry is never zeroed) - ]; - const buf = buildSyntheticBlob("icudt75l", entryNames); - const dir = mkdtempSync(join(tmpdir(), "hole-punch-cli-")); - const filePath = join(dir, "test-binary"); - writeFileSync(filePath, buf); - - const result = runCli(["--verbose", filePath, "-v"]); - expect("results" in result).toBe(true); - if ("results" in result) { - expect(result.results).toHaveLength(1); - expect(result.results[0].status).toBe("success"); - } - }); -});