From f993411cc2ad76bd36de0c6318258cfe1d233fb0 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 24 Nov 2025 14:50:06 -0500 Subject: [PATCH 1/7] util: improve textencoder encodeInto performance Co-authored-by: Erik Corry Co-authored-by: Daniel Lemire --- src/encoding_binding.cc | 172 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 164 insertions(+), 8 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 877ae8a18f6b8f..91463ccd997966 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -6,8 +6,10 @@ #include "node_external_reference.h" #include "simdutf.h" #include "string_bytes.h" +#include "util.h" #include "v8.h" +#include #include namespace node { @@ -71,6 +73,90 @@ InternalFieldInfoBase* BindingData::Serialize(int index) { return info; } +// The following code is adapted from Cloudflare workers. +// Particularly from: https://github.com/cloudflare/workerd/pull/5448 +// +// Copyright (c) 2017-2025 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 +namespace { +constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096; + +constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) { + return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00; +} + +constexpr size_t simpleUtfEncodingLength(uint16_t c) { + if (c < 0x80) return 1; + if (c < 0x400) return 2; + return 3; +} + +template +size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { + size_t pos = 0; + size_t utf8Accumulated = 0; + constexpr size_t CHUNK = 257; + constexpr bool UTF16 = sizeof(Char) == 2; + constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2; + + double expansion = 1.15; + + while (pos < length && utf8Accumulated < bufferSize) { + size_t remainingInput = length - pos; + size_t spaceRemaining = bufferSize - utf8Accumulated; + DCHECK_GE(expansion, 1.15); + + size_t guaranteedToFit = spaceRemaining / MAX_FACTOR; + if (guaranteedToFit >= remainingInput) { + return length; + } + size_t likelyToFit = + std::min(static_cast(spaceRemaining / expansion), CHUNK); + size_t fitEstimate = + std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit)); + size_t chunkSize = std::min(remainingInput, fitEstimate); + if (chunkSize == 1) break; + DCHECK_GE(chunkSize, 1); + + size_t chunkUtf8Len; + if constexpr (UTF16) { + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when + // available For now, validate and use utf8_length_from_utf16 + chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); + } else { + chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); + } + + if (utf8Accumulated + chunkUtf8Len > bufferSize) { + DCHECK_GT(chunkSize, guaranteedToFit); + expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize); + } else { + expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize); + pos += chunkSize; + utf8Accumulated += chunkUtf8Len; + } + } + + while (pos < length && utf8Accumulated < bufferSize) { + size_t extra = simpleUtfEncodingLength(data[pos]); + if (utf8Accumulated + extra > bufferSize) break; + pos++; + utf8Accumulated += extra; + } + + if (UTF16 && pos != 0 && pos != length && + isSurrogatePair(data[pos - 1], data[pos])) { + if (utf8Accumulated < bufferSize) { + pos++; + } else { + pos--; + } + } + return pos; +} +} // namespace + void BindingData::Deserialize(Local context, Local holder, int index, @@ -98,18 +184,88 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { Local dest = args[1].As(); Local buf = dest->Buffer(); + + // Handle detached buffers - return {read: 0, written: 0} + if (buf->Data() == nullptr) { + binding_data->encode_into_results_buffer_[0] = 0; + binding_data->encode_into_results_buffer_[1] = 0; + return; + } + char* write_result = static_cast(buf->Data()) + dest->ByteOffset(); size_t dest_length = dest->ByteLength(); - size_t nchars; - size_t written = source->WriteUtf8V2(isolate, - write_result, - dest_length, - String::WriteFlags::kReplaceInvalidUtf8, - &nchars); + size_t read = 0; + size_t written = 0; + v8::String::ValueView view(isolate, source); + size_t length = view.length(); + + if (view.is_one_byte()) { + auto data = reinterpret_cast(view.data8()); + simdutf::result result = simdutf::validate_ascii_with_errors(data, length); + // Only copy what fits in the destination + written = read = std::min(result.count, dest_length); + if (read > 0) { + memcpy(write_result, data, read); + write_result += read; + data += read; + length -= read; + dest_length -= read; + } + if (length != 0 && dest_length != 0) { + size_t rest = findBestFit(data, length, dest_length); + if (rest != 0) { + DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length); + written += simdutf::convert_latin1_to_utf8(data, rest, write_result); + read += rest; + } + } + } else { + auto data = reinterpret_cast(view.data16()); + + // Check if input has unpaired surrogates - if so, convert to well-formed + // first + simdutf::result validation_result = + simdutf::validate_utf16_with_errors(data, length); + + if (validation_result.error == simdutf::SUCCESS) { + // Valid UTF-16 - use the fast path + read = findBestFit(data, length, dest_length); + if (read != 0) { + DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length); + written = simdutf::convert_utf16_to_utf8(data, read, write_result); + } + } else { + // Invalid UTF-16 with unpaired surrogates - convert to well-formed first + // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when + // available + // Limit conversion to what could fit in destination, avoiding splitting + // a valid surrogate pair at the boundary + size_t safe_length = std::min(length, dest_length); + if (safe_length > 0 && safe_length < view.length() && + isSurrogatePair(data[safe_length - 1], data[safe_length])) { + safe_length--; + } + + MaybeStackBuffer conversion_buffer( + safe_length); + simdutf::to_well_formed_utf16(data, safe_length, conversion_buffer.out()); + + // Now use findBestFit with the well-formed data + read = findBestFit(conversion_buffer.out(), safe_length, dest_length); + if (read != 0) { + DCHECK_LE( + simdutf::utf8_length_from_utf16(conversion_buffer.out(), read), + dest_length); + written = simdutf::convert_utf16_to_utf8( + conversion_buffer.out(), read, write_result); + } + } + } + DCHECK_LE(written, dest_length); - binding_data->encode_into_results_buffer_[0] = nchars; - binding_data->encode_into_results_buffer_[1] = written; + binding_data->encode_into_results_buffer_[0] = static_cast(read); + binding_data->encode_into_results_buffer_[1] = static_cast(written); } // Encode a single string to a UTF-8 Uint8Array (not Buffer). From 66bc586927eb5120192ffa5ae49a2ce64b57ee0d Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 24 Nov 2025 17:22:37 -0500 Subject: [PATCH 2/7] fixup! util: improve textencoder encodeInto performance --- src/encoding_binding.cc | 52 ++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 91463ccd997966..c9303f8ed62e06 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -198,22 +198,21 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { size_t read = 0; size_t written = 0; v8::String::ValueView view(isolate, source); - size_t length = view.length(); + size_t length_that_fits = + std::min(static_cast(view.length()), dest_length); if (view.is_one_byte()) { auto data = reinterpret_cast(view.data8()); - simdutf::result result = simdutf::validate_ascii_with_errors(data, length); - // Only copy what fits in the destination - written = read = std::min(result.count, dest_length); - if (read > 0) { - memcpy(write_result, data, read); - write_result += read; - data += read; - length -= read; - dest_length -= read; - } - if (length != 0 && dest_length != 0) { - size_t rest = findBestFit(data, length, dest_length); + simdutf::result result = + simdutf::validate_ascii_with_errors(data, length_that_fits); + written = read = result.count; + memcpy(write_result, data, read); + write_result += read; + data += read; + length_that_fits -= read; + dest_length -= read; + if (length_that_fits != 0 && dest_length != 0) { + size_t rest = findBestFit(data, length_that_fits, dest_length); if (rest != 0) { DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length); written += simdutf::convert_latin1_to_utf8(data, rest, write_result); @@ -223,14 +222,21 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { } else { auto data = reinterpret_cast(view.data16()); + // Limit conversion to what could fit in destination, avoiding splitting + // a valid surrogate pair at the boundary + if (length_that_fits > 0 && length_that_fits < view.length() && + isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) { + length_that_fits--; + } + // Check if input has unpaired surrogates - if so, convert to well-formed // first simdutf::result validation_result = - simdutf::validate_utf16_with_errors(data, length); + simdutf::validate_utf16_with_errors(data, length_that_fits); if (validation_result.error == simdutf::SUCCESS) { // Valid UTF-16 - use the fast path - read = findBestFit(data, length, dest_length); + read = findBestFit(data, view.length(), dest_length); if (read != 0) { DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length); written = simdutf::convert_utf16_to_utf8(data, read, write_result); @@ -239,20 +245,14 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { // Invalid UTF-16 with unpaired surrogates - convert to well-formed first // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when // available - // Limit conversion to what could fit in destination, avoiding splitting - // a valid surrogate pair at the boundary - size_t safe_length = std::min(length, dest_length); - if (safe_length > 0 && safe_length < view.length() && - isSurrogatePair(data[safe_length - 1], data[safe_length])) { - safe_length--; - } - MaybeStackBuffer conversion_buffer( - safe_length); - simdutf::to_well_formed_utf16(data, safe_length, conversion_buffer.out()); + length_that_fits); + simdutf::to_well_formed_utf16( + data, length_that_fits, conversion_buffer.out()); // Now use findBestFit with the well-formed data - read = findBestFit(conversion_buffer.out(), safe_length, dest_length); + read = + findBestFit(conversion_buffer.out(), length_that_fits, dest_length); if (read != 0) { DCHECK_LE( simdutf::utf8_length_from_utf16(conversion_buffer.out(), read), From eaced01d348f3c736452d472455a8b7156ce2e29 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 24 Nov 2025 19:26:16 -0500 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Erik Corry --- src/encoding_binding.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index c9303f8ed62e06..61878d145aa977 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -117,12 +117,15 @@ size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit)); size_t chunkSize = std::min(remainingInput, fitEstimate); if (chunkSize == 1) break; - DCHECK_GE(chunkSize, 1); + DCHECK_GT(chunkSize, 1); size_t chunkUtf8Len; if constexpr (UTF16) { // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when // available For now, validate and use utf8_length_from_utf16 + size_t newPos = pos + chunkSize; + if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos])) + chunkSize--; chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); } else { chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); @@ -223,7 +226,8 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { auto data = reinterpret_cast(view.data16()); // Limit conversion to what could fit in destination, avoiding splitting - // a valid surrogate pair at the boundary + // a valid surrogate pair at the boundary, which could cause a spurious call + // of simdutf::to_well_formed_utf16() if (length_that_fits > 0 && length_that_fits < view.length() && isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) { length_that_fits--; @@ -236,7 +240,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { if (validation_result.error == simdutf::SUCCESS) { // Valid UTF-16 - use the fast path - read = findBestFit(data, view.length(), dest_length); + read = findBestFit(data, length_that_fits, dest_length); if (read != 0) { DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length); written = simdutf::convert_utf16_to_utf8(data, read, write_result); @@ -262,7 +266,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { } } } - DCHECK_LE(written, dest_length); + DCHECK_LE(written, dest->ByteLength();); binding_data->encode_into_results_buffer_[0] = static_cast(read); binding_data->encode_into_results_buffer_[1] = static_cast(written); From ee71c43d257ba19d8a911a02bb4ab5bb30478836 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 25 Nov 2025 14:53:15 -0500 Subject: [PATCH 4/7] add old way for <= 32 length --- src/encoding_binding.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 61878d145aa977..771f04a44d7d67 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -198,6 +198,19 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { char* write_result = static_cast(buf->Data()) + dest->ByteOffset(); size_t dest_length = dest->ByteLength(); + // For small strings (length <= 32), use the old V8 path for better performance + if (source->Length() <= 32) { + size_t nchars; + size_t written = source->WriteUtf8V2(isolate, + write_result, + dest_length, + String::WriteFlags::kReplaceInvalidUtf8, + &nchars); + binding_data->encode_into_results_buffer_[0] = nchars; + binding_data->encode_into_results_buffer_[1] = written; + return; + } + size_t read = 0; size_t written = 0; v8::String::ValueView view(isolate, source); From 435f05d2709f3bf3f83d1f65db5e3a570b85f91c Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 25 Nov 2025 14:54:50 -0500 Subject: [PATCH 5/7] convert DCHECK to CHECK --- src/encoding_binding.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 771f04a44d7d67..8bb79a344548a5 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -117,7 +117,7 @@ size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit)); size_t chunkSize = std::min(remainingInput, fitEstimate); if (chunkSize == 1) break; - DCHECK_GT(chunkSize, 1); + CHECK_GT(chunkSize, 1); size_t chunkUtf8Len; if constexpr (UTF16) { @@ -198,14 +198,16 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { char* write_result = static_cast(buf->Data()) + dest->ByteOffset(); size_t dest_length = dest->ByteLength(); - // For small strings (length <= 32), use the old V8 path for better performance + // For small strings (length <= 32), use the old V8 path for better + // performance if (source->Length() <= 32) { size_t nchars; - size_t written = source->WriteUtf8V2(isolate, - write_result, - dest_length, - String::WriteFlags::kReplaceInvalidUtf8, - &nchars); + size_t written = + source->WriteUtf8V2(isolate, + write_result, + dest_length, + String::WriteFlags::kReplaceInvalidUtf8, + &nchars); binding_data->encode_into_results_buffer_[0] = nchars; binding_data->encode_into_results_buffer_[1] = written; return; From 4cec6dc993d24128aef9a2a74e2e5f6f0cdbd7b1 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 25 Nov 2025 19:44:04 -0500 Subject: [PATCH 6/7] Update encoding_binding.cc --- src/encoding_binding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index 8bb79a344548a5..f936e48bcc5143 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -281,7 +281,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { } } } - DCHECK_LE(written, dest->ByteLength();); + DCHECK_LE(written, dest->ByteLength()); binding_data->encode_into_results_buffer_[0] = static_cast(read); binding_data->encode_into_results_buffer_[1] = static_cast(written); From 730da98b4c10756fca7064297c995c79d86e907e Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Sat, 29 Nov 2025 21:52:50 -0500 Subject: [PATCH 7/7] address pr reviews --- src/encoding_binding.cc | 46 ++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index f936e48bcc5143..88d49d7fcd134d 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -92,6 +92,26 @@ constexpr size_t simpleUtfEncodingLength(uint16_t c) { return 3; } +// Finds the maximum number of input characters (UTF-16 or Latin1) that can be +// encoded into a UTF-8 buffer of the given size. +// +// The challenge is that UTF-8 encoding expands characters by variable amounts: +// - ASCII (< 0x80): 1 byte +// - Code points < 0x800: 2 bytes +// - Other BMP characters: 3 bytes +// - Surrogate pairs (supplementary planes): 4 bytes total +// +// This function uses an adaptive chunking algorithm: +// 1. Process the input in chunks, estimating how many characters will fit +// 2. Calculate the actual UTF-8 length for each chunk using simdutf +// 3. Adjust the expansion factor based on observed encoding ratios +// 4. Fall back to character-by-character processing near the buffer boundary +// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries +// +// The algorithm starts with a conservative expansion estimate (1.15x) and +// dynamically adjusts based on actual character distribution, making it +// efficient for common ASCII-heavy text while remaining correct for +// multi-byte heavy content. template size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { size_t pos = 0; @@ -197,24 +217,23 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { char* write_result = static_cast(buf->Data()) + dest->ByteOffset(); size_t dest_length = dest->ByteLength(); + size_t read = 0; + size_t written = 0; // For small strings (length <= 32), use the old V8 path for better // performance - if (source->Length() <= 32) { - size_t nchars; - size_t written = - source->WriteUtf8V2(isolate, - write_result, - dest_length, - String::WriteFlags::kReplaceInvalidUtf8, - &nchars); - binding_data->encode_into_results_buffer_[0] = nchars; - binding_data->encode_into_results_buffer_[1] = written; + static constexpr int kSmallStringThreshold = 32; + if (source->Length() <= kSmallStringThreshold) { + written = source->WriteUtf8V2(isolate, + write_result, + dest_length, + String::WriteFlags::kReplaceInvalidUtf8, + &read); + binding_data->encode_into_results_buffer_[0] = static_cast(read); + binding_data->encode_into_results_buffer_[1] = static_cast(written); return; } - size_t read = 0; - size_t written = 0; v8::String::ValueView view(isolate, source); size_t length_that_fits = std::min(static_cast(view.length()), dest_length); @@ -230,8 +249,7 @@ void BindingData::EncodeInto(const FunctionCallbackInfo& args) { length_that_fits -= read; dest_length -= read; if (length_that_fits != 0 && dest_length != 0) { - size_t rest = findBestFit(data, length_that_fits, dest_length); - if (rest != 0) { + if (size_t rest = findBestFit(data, length_that_fits, dest_length)) { DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length); written += simdutf::convert_latin1_to_utf8(data, rest, write_result); read += rest;