From cc39cf3e5d9c6eef43648b3b699dcf427812ef8d Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 31 Oct 2025 10:50:03 -0400 Subject: [PATCH 01/29] experiment with value view and simdutf --- src/workerd/api/encoding.c++ | 62 ++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 3dc4d2d9367..e8fa5e797c0 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -491,25 +491,53 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } -namespace { -TextEncoder::EncodeIntoResult encodeIntoImpl( - jsg::Lock& js, jsg::JsString input, jsg::BufferSource& buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr().asChars(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), - }; -} -} // namespace - jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { auto str = input.orDefault(js.str()); - auto view = JSG_REQUIRE_NONNULL(jsg::BufferSource::tryAlloc(js, str.utf8Length(js)), RangeError, - "Cannot allocate space for TextEncoder.encode"); - [[maybe_unused]] auto result = encodeIntoImpl(js, str, view); - KJ_DASSERT(result.written == view.size()); - return kj::mv(view); + + // Do the conversion while ValueView is alive, but to a C++ heap buffer (not V8 heap) + kj::Array output_data; + + { + v8::String::ValueView value_view(js.v8Isolate, str); + size_t length = static_cast(value_view.length()); + + if (value_view.is_one_byte()) { + auto data = reinterpret_cast(value_view.data8()); + size_t utf8_length = simdutf::utf8_length_from_latin1(data, length); + output_data = kj::heapArray(utf8_length); + [[maybe_unused]] auto written = + simdutf::convert_latin1_to_utf8(data, length, output_data.asChars().begin()); + KJ_DASSERT(written == output_data.size()); + } else { + auto data = reinterpret_cast(value_view.data16()); + + // Check if UTF-16LE is valid + auto validation_result = simdutf::validate_utf16le(data, length); + + if (validation_result) { + // Valid UTF-16LE, convert directly + size_t utf8_length = simdutf::utf8_length_from_utf16le(data, length); + output_data = kj::heapArray(utf8_length); + [[maybe_unused]] auto written = + simdutf::convert_utf16le_to_utf8(data, length, output_data.asChars().begin()); + KJ_DASSERT(written == output_data.size()); + } else { + // Invalid UTF-16LE (unpaired surrogates), fix it first + auto well_formed = kj::heapArray(length); + simdutf::to_well_formed_utf16le(data, length, well_formed.begin()); + + // Now convert the well-formed UTF-16LE to UTF-8 + size_t utf8_length = simdutf::utf8_length_from_utf16le(well_formed.begin(), length); + output_data = kj::heapArray(utf8_length); + [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( + well_formed.begin(), length, output_data.asChars().begin()); + KJ_DASSERT(written == output_data.size()); + } + } + } // ValueView destroyed here, releasing the heap lock + + // Now create BufferSource from the output data (this allocates V8 objects, which is now safe) + return jsg::BufferSource(js, jsg::BackingStore::from(js, kj::mv(output_data))); } TextEncoder::EncodeIntoResult TextEncoder::encodeInto( From d93fcf7e6d434dbf7bcd39ea91a20291b2656657 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 31 Oct 2025 13:05:25 -0400 Subject: [PATCH 02/29] address pr reviews --- src/workerd/api/encoding.c++ | 91 +++++++++++++++++++++------------- src/workerd/jsg/buffersource.h | 7 +-- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index e8fa5e797c0..83e875fbd01 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -492,52 +492,73 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { } jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { - auto str = input.orDefault(js.str()); + jsg::JsString str = input.orDefault(js.str()); - // Do the conversion while ValueView is alive, but to a C++ heap buffer (not V8 heap) - kj::Array output_data; + if (str.length(js) == 0) { + return jsg::BufferSource(js, jsg::BackingStore::alloc(js, 0)); + } - { + // Allocate the output buffer and perform the conversion while ValueView is alive, but defer + // creating the V8 BufferSource until after ValueView is destroyed. This approach uses + // BackingStore::wrap with a custom disposer to avoid the copy overhead that would occur with + // BackingStore::from in the v8 sandbox, since from() copies data when it's not already in the + // sandbox. By using new/delete with wrap(), we maintain ownership semantics compatible with V8's + // C-style BackingStore API while avoiding the extra allocation and copy. + jsg::BackingStore backing = [&]() { v8::String::ValueView value_view(js.v8Isolate, str); size_t length = static_cast(value_view.length()); if (value_view.is_one_byte()) { + // Fast path for Latin-1 encoded strings. V8 uses Latin-1 (ISO-8859-1) encoding internally + // for strings that contain only code points <= U+00FF. We need to convert to UTF-8. auto data = reinterpret_cast(value_view.data8()); size_t utf8_length = simdutf::utf8_length_from_latin1(data, length); - output_data = kj::heapArray(utf8_length); + auto* output = new kj::Array(kj::heapArray(utf8_length)); [[maybe_unused]] auto written = - simdutf::convert_latin1_to_utf8(data, length, output_data.asChars().begin()); - KJ_DASSERT(written == output_data.size()); - } else { - auto data = reinterpret_cast(value_view.data16()); - - // Check if UTF-16LE is valid - auto validation_result = simdutf::validate_utf16le(data, length); - - if (validation_result) { - // Valid UTF-16LE, convert directly - size_t utf8_length = simdutf::utf8_length_from_utf16le(data, length); - output_data = kj::heapArray(utf8_length); - [[maybe_unused]] auto written = - simdutf::convert_utf16le_to_utf8(data, length, output_data.asChars().begin()); - KJ_DASSERT(written == output_data.size()); - } else { - // Invalid UTF-16LE (unpaired surrogates), fix it first - auto well_formed = kj::heapArray(length); - simdutf::to_well_formed_utf16le(data, length, well_formed.begin()); - - // Now convert the well-formed UTF-16LE to UTF-8 - size_t utf8_length = simdutf::utf8_length_from_utf16le(well_formed.begin(), length); - output_data = kj::heapArray(utf8_length); - [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( - well_formed.begin(), length, output_data.asChars().begin()); - KJ_DASSERT(written == output_data.size()); - } + simdutf::convert_latin1_to_utf8(data, length, output->asChars().begin()); + KJ_DASSERT(written == output->size()); + return jsg::BackingStore::wrap(output->begin(), output->size(), + [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, + output); + } + + // Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points + // > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path. + auto data = reinterpret_cast(value_view.data16()); + auto valid_utf16 = simdutf::validate_utf16le(data, length); + + if (valid_utf16) { + // Common case: valid UTF-16LE, convert directly to UTF-8 + size_t utf8_length = simdutf::utf8_length_from_utf16le(data, length); + auto* output = new kj::Array(kj::heapArray(utf8_length)); + [[maybe_unused]] auto written = + simdutf::convert_utf16le_to_utf8(data, length, output->asChars().begin()); + KJ_DASSERT(written == output->size()); + return jsg::BackingStore::wrap(output->begin(), output->size(), + [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, + output); } - } // ValueView destroyed here, releasing the heap lock - // Now create BufferSource from the output data (this allocates V8 objects, which is now safe) - return jsg::BufferSource(js, jsg::BackingStore::from(js, kj::mv(output_data))); + // Rare case: Invalid UTF-16LE with unpaired surrogates. Per the Encoding Standard, we must + // replace unpaired surrogates with U+FFFD replacement characters. We do this in two passes: + // first fix the UTF-16, then convert to UTF-8. This extra buffer allocation only happens + // for malformed strings, which should be uncommon in practice. + auto well_formed = kj::heapArray(length); + simdutf::to_well_formed_utf16le(data, length, well_formed.begin()); + + size_t utf8_length = simdutf::utf8_length_from_utf16le(well_formed.begin(), length); + auto* output = new kj::Array(kj::heapArray(utf8_length)); + [[maybe_unused]] auto written = + simdutf::convert_utf16le_to_utf8(well_formed.begin(), length, output->asChars().begin()); + KJ_DASSERT(written == output->size()); + return jsg::BackingStore::wrap(output->begin(), output->size(), + [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, + output); + }(); // ValueView destroyed here, releasing the heap lock + + // Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects. + // Construct the BufferSource which will create the actual Uint8Array that gets returned to JS. + return jsg::BufferSource(js, kj::mv(backing)); } TextEncoder::EncodeIntoResult TextEncoder::encodeInto( diff --git a/src/workerd/jsg/buffersource.h b/src/workerd/jsg/buffersource.h index 65fc36ab960..018e4a3b8e9 100644 --- a/src/workerd/jsg/buffersource.h +++ b/src/workerd/jsg/buffersource.h @@ -102,9 +102,10 @@ class BackingStore { // Creates a new BackingStore of the given size. template - static BackingStore alloc(Lock& js, size_t size) { - return BackingStore(js.allocBackingStore(size), size, 0, getBufferSourceElementSize(), - construct, checkIsIntegerType()); + static BackingStore alloc( + Lock& js, size_t size, Lock::AllocOption init_mode = Lock::AllocOption::ZERO_INITIALIZED) { + return BackingStore(js.allocBackingStore(size, init_mode), size, 0, + getBufferSourceElementSize(), construct, checkIsIntegerType()); } using Disposer = void(void*, size_t, void*); From 28b102d7ef79b5b90262871720056ecc26322e02 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 31 Oct 2025 14:07:10 -0400 Subject: [PATCH 03/29] address pr reviews --- src/workerd/api/encoding.c++ | 146 ++++++++++++++++++++++------------- src/workerd/api/encoding.h | 6 +- src/workerd/jsg/jsvalue.c++ | 4 + src/workerd/jsg/jsvalue.h | 7 ++ 4 files changed, 103 insertions(+), 60 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 83e875fbd01..7f51be02837 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -491,74 +491,110 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } -jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { +jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { jsg::JsString str = input.orDefault(js.str()); - if (str.length(js) == 0) { - return jsg::BufferSource(js, jsg::BackingStore::alloc(js, 0)); - } - - // Allocate the output buffer and perform the conversion while ValueView is alive, but defer - // creating the V8 BufferSource until after ValueView is destroyed. This approach uses - // BackingStore::wrap with a custom disposer to avoid the copy overhead that would occur with - // BackingStore::from in the v8 sandbox, since from() copies data when it's not already in the - // sandbox. By using new/delete with wrap(), we maintain ownership semantics compatible with V8's - // C-style BackingStore API while avoiding the extra allocation and copy. - jsg::BackingStore backing = [&]() { - v8::String::ValueView value_view(js.v8Isolate, str); - size_t length = static_cast(value_view.length()); - - if (value_view.is_one_byte()) { - // Fast path for Latin-1 encoded strings. V8 uses Latin-1 (ISO-8859-1) encoding internally - // for strings that contain only code points <= U+00FF. We need to convert to UTF-8. - auto data = reinterpret_cast(value_view.data8()); - size_t utf8_length = simdutf::utf8_length_from_latin1(data, length); - auto* output = new kj::Array(kj::heapArray(utf8_length)); - [[maybe_unused]] auto written = - simdutf::convert_latin1_to_utf8(data, length, output->asChars().begin()); - KJ_DASSERT(written == output->size()); - return jsg::BackingStore::wrap(output->begin(), output->size(), - [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, - output); + if (str.isOneByte(js)) { + auto length = str.length(js); + // Fast path for one-byte strings (Latin-1). writeOneByte() copies the raw bytes without + // flattening the string, which is more efficient than using ValueView. Note that we + // allocate `length * 2` bytes because Latin-1 characters 0x80-0xFF need 2 bytes in UTF-8. + auto backing = + jsg::BackingStore::alloc(js, length, jsg::Lock::AllocOption::UNINITIALIZED); + str.writeOneByte( + js, backing.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + auto backingData = reinterpret_cast(backing.asArrayPtr().begin()); + + size_t utf8_length = simdutf::utf8_length_from_latin1(backingData, length); + + if (utf8_length == length) { + return jsg::JsUint8Array(backing.createHandle(js).As()); } + auto backing2 = jsg::BackingStore::alloc( + js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + auto written = simdutf::convert_latin1_to_utf8( + backingData, length, reinterpret_cast(backing2.asArrayPtr().begin())); + KJ_DASSERT(backing2.size() == written); + return jsg::JsUint8Array(backing2.createHandle(js).As()); + } + + // First pass: Calculate the required UTF-8 output buffer size. + // We need to do this in a separate ValueView because: + // 1. ValueView holds the V8 heap lock, which prevents us from allocating new V8 objects + // 2. We must determine the exact output size before allocating the BackingStore + // 3. Once we know the size, we'll create a second ValueView to do the actual conversion + size_t utf8_length = 0; + bool isValidUtf16 = true; + // For invalid UTF-16 strings (with unpaired surrogates), we need to fix them to well-formed + // UTF-16 before calculating the UTF-8 length. We store the fixed version here so it can be + // reused in the second pass, avoiding the need to fix it twice. + kj::Array wellFormed; + + { + v8::String::ValueView view(js.v8Isolate, str); + // One-byte strings are handled by the fast path above + KJ_DASSERT(!view.is_one_byte()); + + auto data = reinterpret_cast(view.data16()); // Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points // > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path. - auto data = reinterpret_cast(value_view.data16()); - auto valid_utf16 = simdutf::validate_utf16le(data, length); - - if (valid_utf16) { + isValidUtf16 = simdutf::validate_utf16le(data, view.length()); + + if (isValidUtf16) { + // Common case: valid UTF-16, calculate UTF-8 length directly + utf8_length = simdutf::utf8_length_from_utf16le(data, view.length()); + } else { + // Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard, + // unpaired surrogates must be replaced with U+FFFD (replacement character). + // U+FFFD is 3 bytes in UTF-8, which means the UTF-8 length will differ from what + // we'd calculate from the invalid UTF-16. We must fix the UTF-16 first, then + // calculate the UTF-8 length from the well-formed version to get the correct size. + wellFormed = kj::heapArray(view.length()); + simdutf::to_well_formed_utf16le(data, view.length(), wellFormed.begin()); + utf8_length = simdutf::utf8_length_from_utf16le(wellFormed.begin(), view.length()); + } + } // ValueView destroyed here, releasing the heap lock + + // Pre-allocate the jsg::BackingStore to avoid the copy overhead that would occur with + // BackingStore::from() in the v8 sandbox, since from() copies data when it's not already in the + // sandbox. By pre-allocating with alloc(), the memory is already in the sandbox and we can + // perform the conversion directly into it. + auto backing = jsg::BackingStore::alloc( + js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + + // Second pass: Perform the actual UTF-8 conversion. + // We create a new ValueView here to access the string data again, now that we have a + // pre-allocated output buffer. The closure ensures the ValueView is destroyed before we + // return the result, which is important for proper V8 heap management. + [&]() { + v8::String::ValueView view(js.v8Isolate, str); + // One-byte strings are handled by the fast path above + KJ_DASSERT(!view.is_one_byte()); + + size_t length = static_cast(view.length()); + auto* output = backing.asArrayPtr().begin(); + auto data = reinterpret_cast(view.data16()); + + if (isValidUtf16) { // Common case: valid UTF-16LE, convert directly to UTF-8 - size_t utf8_length = simdutf::utf8_length_from_utf16le(data, length); - auto* output = new kj::Array(kj::heapArray(utf8_length)); - [[maybe_unused]] auto written = - simdutf::convert_utf16le_to_utf8(data, length, output->asChars().begin()); - KJ_DASSERT(written == output->size()); - return jsg::BackingStore::wrap(output->begin(), output->size(), - [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, - output); + [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(data, length, output); + KJ_DASSERT(written == backing.size()); + return; } - // Rare case: Invalid UTF-16LE with unpaired surrogates. Per the Encoding Standard, we must - // replace unpaired surrogates with U+FFFD replacement characters. We do this in two passes: - // first fix the UTF-16, then convert to UTF-8. This extra buffer allocation only happens - // for malformed strings, which should be uncommon in practice. - auto well_formed = kj::heapArray(length); - simdutf::to_well_formed_utf16le(data, length, well_formed.begin()); - - size_t utf8_length = simdutf::utf8_length_from_utf16le(well_formed.begin(), length); - auto* output = new kj::Array(kj::heapArray(utf8_length)); + // Rare case: Invalid UTF-16LE with unpaired surrogates. We already fixed the UTF-16 to + // well-formed in the first pass (stored in wellFormed array), so now we just convert that + // fixed version to UTF-8. This reuses the wellFormed array created earlier, avoiding the + // need to fix the UTF-16 a second time. [[maybe_unused]] auto written = - simdutf::convert_utf16le_to_utf8(well_formed.begin(), length, output->asChars().begin()); - KJ_DASSERT(written == output->size()); - return jsg::BackingStore::wrap(output->begin(), output->size(), - [](void*, size_t, void* ptr) { delete reinterpret_cast*>(ptr); }, - output); + simdutf::convert_utf16le_to_utf8(wellFormed.begin(), wellFormed.size(), output); + KJ_DASSERT(written == backing.size()); }(); // ValueView destroyed here, releasing the heap lock // Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects. - // Construct the BufferSource which will create the actual Uint8Array that gets returned to JS. - return jsg::BufferSource(js, kj::mv(backing)); + // Create the Uint8Array from the BackingStore and return it to JS. + return jsg::JsUint8Array(backing.createHandle(js).As()); } TextEncoder::EncodeIntoResult TextEncoder::encodeInto( diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index 732ee916cba..e694ad1b355 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -218,7 +218,7 @@ class TextEncoder final: public jsg::Object { static jsg::Ref constructor(jsg::Lock& js); - jsg::BufferSource encode(jsg::Lock& js, jsg::Optional input); + jsg::JsUint8Array encode(jsg::Lock& js, jsg::Optional input); EncodeIntoResult encodeInto(jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer); @@ -236,11 +236,7 @@ class TextEncoder final: public jsg::Object { JSG_READONLY_INSTANCE_PROPERTY(encoding, getEncoding); } - // `encode()` returns `jsg::BufferSource`, which may be an `ArrayBuffer` or `ArrayBufferView`, - // but the implementation uses `jsg::BufferSource::tryAlloc()` which always tries to allocate a - // `Uint8Array`. The spec defines that this function returns a `Uint8Array` too. JSG_TS_OVERRIDE({ - encode(input?: string): Uint8Array; encodeInto(input: string, buffer: Uint8Array): TextEncoderEncodeIntoResult; }); } diff --git a/src/workerd/jsg/jsvalue.c++ b/src/workerd/jsg/jsvalue.c++ index 20a9614ed93..4eb3298fec7 100644 --- a/src/workerd/jsg/jsvalue.c++ +++ b/src/workerd/jsg/jsvalue.c++ @@ -377,6 +377,10 @@ JsString JsString::internalize(Lock& js) const { return JsString(inner->InternalizeString(js.v8Isolate)); } +void JsString::writeOneByte(Lock& js, kj::ArrayPtr buffer, WriteFlags flags) { + inner->WriteOneByteV2(js.v8Isolate, 0, buffer.size(), buffer.begin(), flags); +} + JsString::WriteIntoStatus JsString::writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options) const { WriteIntoStatus result = {0, 0}; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index 61481f4521d..f487f713881 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -277,6 +277,7 @@ class JsString final: public JsBase { int hashCode() const; bool isFlat() const; + bool isOneByte(Lock& js) const KJ_WARN_UNUSED_RESULT; bool containsOnlyOneByte() const; bool operator==(const JsString& other) const; @@ -311,6 +312,8 @@ class JsString final: public JsBase { WriteIntoStatus writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options = WriteFlags::NONE) const; + void writeOneByte(Lock& js, kj::ArrayPtr buffer, WriteFlags flags = WriteFlags::NONE); + using JsBase::JsBase; }; @@ -986,6 +989,10 @@ inline int JsString::length(jsg::Lock& js) const { return inner->Length(); } +inline bool JsString::isOneByte(jsg::Lock& js) const { + return inner->IsOneByte(); +} + inline size_t JsString::utf8Length(jsg::Lock& js) const { return inner->Utf8LengthV2(js.v8Isolate); } From d6691fe57098866d23327dde83e246b2e20691ef Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 3 Nov 2025 12:41:24 -0500 Subject: [PATCH 04/29] get rid of multiple valueviews --- src/workerd/api/encoding.c++ | 118 +++++++++++++---------------------- 1 file changed, 45 insertions(+), 73 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 7f51be02837..1d2fd33d27c 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -493,108 +493,80 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { jsg::JsString str = input.orDefault(js.str()); + std::shared_ptr backingStore; + size_t utf8_length = 0; + // Fast path: check if string is one-byte before creating ValueView if (str.isOneByte(js)) { auto length = str.length(js); - // Fast path for one-byte strings (Latin-1). writeOneByte() copies the raw bytes without - // flattening the string, which is more efficient than using ValueView. Note that we - // allocate `length * 2` bytes because Latin-1 characters 0x80-0xFF need 2 bytes in UTF-8. - auto backing = - jsg::BackingStore::alloc(js, length, jsg::Lock::AllocOption::UNINITIALIZED); - str.writeOneByte( - js, backing.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); - auto backingData = reinterpret_cast(backing.asArrayPtr().begin()); + // Allocate buffer for Latin-1. Use v8::ArrayBuffer::NewBackingStore to avoid creating + // JS objects during conversion. + backingStore = v8::ArrayBuffer::NewBackingStore( + js.v8Isolate, length, v8::BackingStoreInitializationMode::kUninitialized); + auto backingData = reinterpret_cast(backingStore->Data()); + + str.writeOneByte(js, kj::ArrayPtr(backingData, length), + jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); - size_t utf8_length = simdutf::utf8_length_from_latin1(backingData, length); + utf8_length = + simdutf::utf8_length_from_latin1(reinterpret_cast(backingData), length); if (utf8_length == length) { - return jsg::JsUint8Array(backing.createHandle(js).As()); + // ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII + auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, length); + return jsg::JsUint8Array(array); } - auto backing2 = jsg::BackingStore::alloc( - js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - auto written = simdutf::convert_latin1_to_utf8( - backingData, length, reinterpret_cast(backing2.asArrayPtr().begin())); - KJ_DASSERT(backing2.size() == written); - return jsg::JsUint8Array(backing2.createHandle(js).As()); + // Need to convert Latin-1 to UTF-8 + std::shared_ptr backingStore2 = v8::ArrayBuffer::NewBackingStore( + js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); + auto written = simdutf::convert_latin1_to_utf8(reinterpret_cast(backingData), + length, reinterpret_cast(backingStore2->Data())); + KJ_DASSERT(utf8_length == written); + auto array = + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore2), 0, utf8_length); + return jsg::JsUint8Array(array); } - // First pass: Calculate the required UTF-8 output buffer size. - // We need to do this in a separate ValueView because: - // 1. ValueView holds the V8 heap lock, which prevents us from allocating new V8 objects - // 2. We must determine the exact output size before allocating the BackingStore - // 3. Once we know the size, we'll create a second ValueView to do the actual conversion - size_t utf8_length = 0; - bool isValidUtf16 = true; - // For invalid UTF-16 strings (with unpaired surrogates), we need to fix them to well-formed - // UTF-16 before calculating the UTF-8 length. We store the fixed version here so it can be - // reused in the second pass, avoiding the need to fix it twice. - kj::Array wellFormed; - + // Two-byte string path { + // Note that ValueView flattens the string, if it's not already flattened v8::String::ValueView view(js.v8Isolate, str); - // One-byte strings are handled by the fast path above - KJ_DASSERT(!view.is_one_byte()); - - auto data = reinterpret_cast(view.data16()); // Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points // > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path. - isValidUtf16 = simdutf::validate_utf16le(data, view.length()); + auto data = reinterpret_cast(view.data16()); + bool isValidUtf16 = simdutf::validate_utf16le(data, view.length()); if (isValidUtf16) { - // Common case: valid UTF-16, calculate UTF-8 length directly + // Common case: valid UTF-16, convert directly to UTF-8 utf8_length = simdutf::utf8_length_from_utf16le(data, view.length()); + backingStore = v8::ArrayBuffer::NewBackingStore( + js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); + [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( + data, view.length(), reinterpret_cast(backingStore->Data())); + KJ_DASSERT(written == utf8_length); } else { // Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard, // unpaired surrogates must be replaced with U+FFFD (replacement character). // U+FFFD is 3 bytes in UTF-8, which means the UTF-8 length will differ from what // we'd calculate from the invalid UTF-16. We must fix the UTF-16 first, then // calculate the UTF-8 length from the well-formed version to get the correct size. - wellFormed = kj::heapArray(view.length()); + auto wellFormed = kj::heapArray(view.length()); simdutf::to_well_formed_utf16le(data, view.length(), wellFormed.begin()); utf8_length = simdutf::utf8_length_from_utf16le(wellFormed.begin(), view.length()); + backingStore = v8::ArrayBuffer::NewBackingStore( + js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); + [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( + wellFormed.begin(), wellFormed.size(), reinterpret_cast(backingStore->Data())); + KJ_DASSERT(written == utf8_length); } } // ValueView destroyed here, releasing the heap lock - // Pre-allocate the jsg::BackingStore to avoid the copy overhead that would occur with - // BackingStore::from() in the v8 sandbox, since from() copies data when it's not already in the - // sandbox. By pre-allocating with alloc(), the memory is already in the sandbox and we can - // perform the conversion directly into it. - auto backing = jsg::BackingStore::alloc( - js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - - // Second pass: Perform the actual UTF-8 conversion. - // We create a new ValueView here to access the string data again, now that we have a - // pre-allocated output buffer. The closure ensures the ValueView is destroyed before we - // return the result, which is important for proper V8 heap management. - [&]() { - v8::String::ValueView view(js.v8Isolate, str); - // One-byte strings are handled by the fast path above - KJ_DASSERT(!view.is_one_byte()); - - size_t length = static_cast(view.length()); - auto* output = backing.asArrayPtr().begin(); - auto data = reinterpret_cast(view.data16()); - - if (isValidUtf16) { - // Common case: valid UTF-16LE, convert directly to UTF-8 - [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(data, length, output); - KJ_DASSERT(written == backing.size()); - return; - } - - // Rare case: Invalid UTF-16LE with unpaired surrogates. We already fixed the UTF-16 to - // well-formed in the first pass (stored in wellFormed array), so now we just convert that - // fixed version to UTF-8. This reuses the wellFormed array created earlier, avoiding the - // need to fix the UTF-16 a second time. - [[maybe_unused]] auto written = - simdutf::convert_utf16le_to_utf8(wellFormed.begin(), wellFormed.size(), output); - KJ_DASSERT(written == backing.size()); - }(); // ValueView destroyed here, releasing the heap lock - // Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects. - // Create the Uint8Array from the BackingStore and return it to JS. - return jsg::JsUint8Array(backing.createHandle(js).As()); + // Create the Uint8Array from the raw v8::BackingStore. + auto array = + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); + return jsg::JsUint8Array(array); } TextEncoder::EncodeIntoResult TextEncoder::encodeInto( From 572ffa373da8be27ec97d56b91a4293579c4a35c Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 3 Nov 2025 16:29:10 -0500 Subject: [PATCH 05/29] apply optimization to improve invalid utf16 --- src/workerd/api/encoding.c++ | 136 ++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 1d2fd33d27c..f3cd2bb8fbb 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -487,6 +487,129 @@ kj::Maybe TextDecoder::decodePtr( // ======================================================================================= // TextEncoder implementation +namespace { + +constexpr inline bool isLeadSurrogate(char16_t c) { + return 0xD800 <= c && c < 0xDC00; +} + +constexpr inline bool isTrailSurrogate(char16_t c) { + return 0xDC00 <= c && c <= 0xDFFF; +} + +// Calculate the number of UTF-8 bytes needed for a single UTF-16 code unit +constexpr inline size_t utf8BytesForCodeUnit(char16_t c) { + if (c < 0x80) return 1; + if (c < 0x800) return 2; + return 3; +} + +// Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. +// Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). +size_t utf8LengthFromInvalidUtf16(const char16_t* input, size_t length) { + size_t utf8Length = 0; + bool pendingSurrogate = false; + + for (size_t i = 0; i < length; i++) { + char16_t c = input[i]; + + if (pendingSurrogate) { + if (isTrailSurrogate(c)) { + // Valid surrogate pair = 4 bytes in UTF-8 + utf8Length += 4; + pendingSurrogate = false; + } else { + // Unpaired lead surrogate = U+FFFD (3 bytes) + utf8Length += 3; + if (!isLeadSurrogate(c)) { + utf8Length += utf8BytesForCodeUnit(c); + pendingSurrogate = false; + } + } + } else if (isLeadSurrogate(c)) { + pendingSurrogate = true; + } else { + if (isTrailSurrogate(c)) { + // Unpaired trail surrogate = U+FFFD (3 bytes) + utf8Length += 3; + } else { + utf8Length += utf8BytesForCodeUnit(c); + } + } + } + + if (pendingSurrogate) { + utf8Length += 3; // Trailing unpaired lead surrogate + } + + return utf8Length; +} + +// Encode a single UTF-16 code unit to UTF-8 +inline size_t encodeUtf8CodeUnit(char16_t c, char* out) { + if (c < 0x80) { + *out = static_cast(c); + return 1; + } else if (c < 0x800) { + out[0] = static_cast(0xC0 | (c >> 6)); + out[1] = static_cast(0x80 | (c & 0x3F)); + return 2; + } else { + out[0] = static_cast(0xE0 | (c >> 12)); + out[1] = static_cast(0x80 | ((c >> 6) & 0x3F)); + out[2] = static_cast(0x80 | (c & 0x3F)); + return 3; + } +} + +// Encode a valid surrogate pair to UTF-8 +inline void encodeSurrogatePair(char16_t lead, char16_t trail, char* out) { + uint32_t codepoint = 0x10000 + (((lead & 0x3FF) << 10) | (trail & 0x3FF)); + out[0] = static_cast(0xF0 | (codepoint >> 18)); + out[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); + out[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); + out[3] = static_cast(0x80 | (codepoint & 0x3F)); +} + +// Convert UTF-16 with potentially invalid surrogates to UTF-8. +// Invalid surrogates are replaced with U+FFFD. +void convertInvalidUtf16ToUtf8(const char16_t* input, size_t length, char* out) { + size_t position = 0; + bool pendingSurrogate = false; + + for (size_t i = 0; i < length; i++) { + char16_t c = input[i]; + + if (pendingSurrogate) { + if (isTrailSurrogate(c)) { + encodeSurrogatePair(input[i - 1], c, out + position); + position += 4; + pendingSurrogate = false; + } else { + position += encodeUtf8CodeUnit(0xFFFD, out + position); + if (!isLeadSurrogate(c)) { + position += encodeUtf8CodeUnit(c, out + position); + pendingSurrogate = false; + } + } + } else if (isLeadSurrogate(c)) { + pendingSurrogate = true; + } else { + if (isTrailSurrogate(c)) { + position += encodeUtf8CodeUnit(0xFFFD, out + position); + } else { + position += encodeUtf8CodeUnit(c, out + position); + } + } + } + + if (pendingSurrogate) { + encodeUtf8CodeUnit(0xFFFD, out + position); + } +} + +} // namespace + jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } @@ -548,17 +671,12 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(view.length()); - simdutf::to_well_formed_utf16le(data, view.length(), wellFormed.begin()); - utf8_length = simdutf::utf8_length_from_utf16le(wellFormed.begin(), view.length()); + // Use custom conversion that handles invalid surrogates without creating an + // intermediate well-formed UTF-16 buffer. + utf8_length = utf8LengthFromInvalidUtf16(data, view.length()); backingStore = v8::ArrayBuffer::NewBackingStore( js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); - [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( - wellFormed.begin(), wellFormed.size(), reinterpret_cast(backingStore->Data())); - KJ_DASSERT(written == utf8_length); + convertInvalidUtf16ToUtf8(data, view.length(), reinterpret_cast(backingStore->Data())); } } // ValueView destroyed here, releasing the heap lock From 6ce652b3d17b5d981b2c0ae5ff68d908d0f19e93 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 3 Nov 2025 16:39:04 -0500 Subject: [PATCH 06/29] add missing simdutf dependency --- src/workerd/api/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index 04df1378abe..0254d0994b9 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -436,6 +436,7 @@ wd_cc_library( implementation_deps = [ "//src/workerd/io:features", "//src/workerd/util:strings", + "@simdutf", ], visibility = ["//visibility:public"], deps = [ From d980b423ac8e532e110f543d4e94c53488111e9f Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 4 Nov 2025 15:21:13 -0500 Subject: [PATCH 07/29] apply review recommendations --- src/workerd/api/encoding.c++ | 63 +++++++++++++++++++----------------- src/workerd/jsg/jsvalue.c++ | 4 --- src/workerd/jsg/jsvalue.h | 2 -- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index f3cd2bb8fbb..147edbc60b1 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -506,11 +506,11 @@ constexpr inline size_t utf8BytesForCodeUnit(char16_t c) { // Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. // Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). -size_t utf8LengthFromInvalidUtf16(const char16_t* input, size_t length) { +size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { size_t utf8Length = 0; bool pendingSurrogate = false; - for (size_t i = 0; i < length; i++) { + for (size_t i = 0; i < input.size(); i++) { char16_t c = input[i]; if (pendingSurrogate) { @@ -546,9 +546,9 @@ size_t utf8LengthFromInvalidUtf16(const char16_t* input, size_t length) { } // Encode a single UTF-16 code unit to UTF-8 -inline size_t encodeUtf8CodeUnit(char16_t c, char* out) { +inline size_t encodeUtf8CodeUnit(char16_t c, kj::ArrayPtr out) { if (c < 0x80) { - *out = static_cast(c); + out[0] = static_cast(c); return 1; } else if (c < 0x800) { out[0] = static_cast(0xC0 | (c >> 6)); @@ -563,7 +563,7 @@ inline size_t encodeUtf8CodeUnit(char16_t c, char* out) { } // Encode a valid surrogate pair to UTF-8 -inline void encodeSurrogatePair(char16_t lead, char16_t trail, char* out) { +inline void encodeSurrogatePair(char16_t lead, char16_t trail, kj::ArrayPtr out) { uint32_t codepoint = 0x10000 + (((lead & 0x3FF) << 10) | (trail & 0x3FF)); out[0] = static_cast(0xF0 | (codepoint >> 18)); out[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); @@ -573,22 +573,22 @@ inline void encodeSurrogatePair(char16_t lead, char16_t trail, char* out) { // Convert UTF-16 with potentially invalid surrogates to UTF-8. // Invalid surrogates are replaced with U+FFFD. -void convertInvalidUtf16ToUtf8(const char16_t* input, size_t length, char* out) { +void convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPtr out) { size_t position = 0; bool pendingSurrogate = false; - for (size_t i = 0; i < length; i++) { + for (size_t i = 0; i < input.size(); i++) { char16_t c = input[i]; if (pendingSurrogate) { if (isTrailSurrogate(c)) { - encodeSurrogatePair(input[i - 1], c, out + position); + encodeSurrogatePair(input[i - 1], c, out.slice(position, out.size())); position += 4; pendingSurrogate = false; } else { - position += encodeUtf8CodeUnit(0xFFFD, out + position); + position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); if (!isLeadSurrogate(c)) { - position += encodeUtf8CodeUnit(c, out + position); + position += encodeUtf8CodeUnit(c, out.slice(position, out.size())); pendingSurrogate = false; } } @@ -596,15 +596,15 @@ void convertInvalidUtf16ToUtf8(const char16_t* input, size_t length, char* out) pendingSurrogate = true; } else { if (isTrailSurrogate(c)) { - position += encodeUtf8CodeUnit(0xFFFD, out + position); + position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); } else { - position += encodeUtf8CodeUnit(c, out + position); + position += encodeUtf8CodeUnit(c, out.slice(position, out.size())); } } } if (pendingSurrogate) { - encodeUtf8CodeUnit(0xFFFD, out + position); + encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); } } @@ -624,12 +624,12 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(backingStore->Data()); - str.writeOneByte(js, kj::ArrayPtr(backingData, length), - jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + [[maybe_unused]] auto writeResult = str.writeInto(js, kj::arrayPtr(backingData, length)); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); utf8_length = simdutf::utf8_length_from_latin1(reinterpret_cast(backingData), length); @@ -640,11 +640,14 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional length); + // Need to convert Latin-1 to UTF-8 - std::shared_ptr backingStore2 = v8::ArrayBuffer::NewBackingStore( - js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); - auto written = simdutf::convert_latin1_to_utf8(reinterpret_cast(backingData), - length, reinterpret_cast(backingStore2->Data())); + std::shared_ptr backingStore2 = + js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = + simdutf::convert_latin1_to_utf8(reinterpret_cast(backingData), length, + reinterpret_cast(backingStore2->Data())); KJ_DASSERT(utf8_length == written); auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore2), 0, utf8_length); @@ -658,25 +661,25 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path. auto data = reinterpret_cast(view.data16()); - bool isValidUtf16 = simdutf::validate_utf16le(data, view.length()); - if (isValidUtf16) { + if (simdutf::validate_utf16le(data, view.length())) { // Common case: valid UTF-16, convert directly to UTF-8 utf8_length = simdutf::utf8_length_from_utf16le(data, view.length()); - backingStore = v8::ArrayBuffer::NewBackingStore( - js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( data, view.length(), reinterpret_cast(backingStore->Data())); KJ_DASSERT(written == utf8_length); } else { - // Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard, + // Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard, // unpaired surrogates must be replaced with U+FFFD (replacement character). // Use custom conversion that handles invalid surrogates without creating an // intermediate well-formed UTF-16 buffer. - utf8_length = utf8LengthFromInvalidUtf16(data, view.length()); - backingStore = v8::ArrayBuffer::NewBackingStore( - js.v8Isolate, utf8_length, v8::BackingStoreInitializationMode::kUninitialized); - convertInvalidUtf16ToUtf8(data, view.length(), reinterpret_cast(backingStore->Data())); + auto inputArray = kj::ArrayPtr(data, view.length()); + utf8_length = utf8LengthFromInvalidUtf16(inputArray); + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + auto outputArray = + kj::ArrayPtr(reinterpret_cast(backingStore->Data()), utf8_length); + convertInvalidUtf16ToUtf8(inputArray, outputArray); } } // ValueView destroyed here, releasing the heap lock diff --git a/src/workerd/jsg/jsvalue.c++ b/src/workerd/jsg/jsvalue.c++ index 4eb3298fec7..20a9614ed93 100644 --- a/src/workerd/jsg/jsvalue.c++ +++ b/src/workerd/jsg/jsvalue.c++ @@ -377,10 +377,6 @@ JsString JsString::internalize(Lock& js) const { return JsString(inner->InternalizeString(js.v8Isolate)); } -void JsString::writeOneByte(Lock& js, kj::ArrayPtr buffer, WriteFlags flags) { - inner->WriteOneByteV2(js.v8Isolate, 0, buffer.size(), buffer.begin(), flags); -} - JsString::WriteIntoStatus JsString::writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options) const { WriteIntoStatus result = {0, 0}; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index f487f713881..2c9c1f55fde 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -312,8 +312,6 @@ class JsString final: public JsBase { WriteIntoStatus writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options = WriteFlags::NONE) const; - void writeOneByte(Lock& js, kj::ArrayPtr buffer, WriteFlags flags = WriteFlags::NONE); - using JsBase::JsBase; }; From c4395650cfb30b1ebba986c3cc819280e1e5885f Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 12 Nov 2025 12:21:18 -0500 Subject: [PATCH 08/29] optimize encodeInto --- src/workerd/api/encoding.c++ | 139 ++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 147edbc60b1..5dd02f67148 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -13,6 +13,9 @@ #include #include +#include + +#include #include @@ -690,13 +693,141 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional 0 && adjustedMid < length) { + char16_t prev = data[adjustedMid - 1]; + if (prev >= 0xD800 && prev < 0xDC00) { + adjustedMid--; + } + } + + if (adjustedMid == 0) { + right = 0; + break; + } + + size_t midUtf8Length = simdutf::utf8_length_from_utf16(data, adjustedMid); + if (midUtf8Length <= bufferSize) { + bestFit = adjustedMid; + left = adjustedMid + 1; + } else { + right = adjustedMid - 1; + } + } + + return bestFit; +} + +} // namespace + TextEncoder::EncodeIntoResult TextEncoder::encodeInto( jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + auto outputBuf = buffer.asArrayPtr(); + size_t bufferSize = outputBuf.size(); + + // ValueView provides zero-copy access to V8's internal string representation. + // V8 stores strings as either Latin-1 (one byte per character) or UTF-16. + v8::String::ValueView view(js.v8Isolate, input); + uint32_t length = view.length(); + + if (view.is_one_byte()) { + // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes + auto data = reinterpret_cast(view.data8()); + size_t utf8Length = simdutf::utf8_length_from_latin1(data, length); + + if (utf8Length <= bufferSize) { + size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Buffer too small - find how many characters fit + size_t bestFit = findBestFitLatin1(data, length, bufferSize); + size_t written = simdutf::convert_latin1_to_utf8(data, bestFit, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(bestFit), + .written = static_cast(written), + }; + } + + // UTF-16 path: check for invalid surrogate pairs first + auto data = reinterpret_cast(view.data16()); + + if (simdutf::validate_utf16(data, length)) { + size_t utf8Length = simdutf::utf8_length_from_utf16(data, length); + if (utf8Length <= bufferSize) { + size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + size_t bestFit = findBestFitUtf16(data, length, bufferSize); + size_t written = simdutf::convert_utf16_to_utf8(data, bestFit, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(bestFit), + .written = static_cast(written), + }; + } + + // Invalid UTF-16: normalize unpaired surrogates to U+FFFD before converting + kj::SmallArray tempBuf(length); + simdutf::to_well_formed_utf16(data, length, tempBuf.begin()); + + size_t utf8Length = simdutf::utf8_length_from_utf16(tempBuf.begin(), length); + if (utf8Length <= bufferSize) { + size_t written = simdutf::convert_utf16_to_utf8(tempBuf.begin(), length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + size_t bestFit = findBestFitUtf16(tempBuf.begin(), length, bufferSize); + size_t written = simdutf::convert_utf16_to_utf8(tempBuf.begin(), bestFit, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), + .read = static_cast(bestFit), + .written = static_cast(written), }; } From abef75cfd39c668a60788f96469a410de30b9c43 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 12 Nov 2025 14:04:09 -0500 Subject: [PATCH 09/29] optimize ASCII paths --- src/workerd/api/encoding.c++ | 87 +++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 5dd02f67148..c4848883722 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -576,7 +576,8 @@ inline void encodeSurrogatePair(char16_t lead, char16_t trail, kj::ArrayPtr input, kj::ArrayPtr out) { +// Returns the number of UTF-8 bytes written. +size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPtr out) { size_t position = 0; bool pendingSurrogate = false; @@ -607,8 +608,10 @@ void convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPtr< } if (pendingSurrogate) { - encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); + position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); } + + return position; } } // namespace @@ -755,6 +758,43 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) return bestFit; } +// Binary search to find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8. +// Ensures surrogate pairs are never split, and unpaired surrogates are replaced with U+FFFD. +size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t bufferSize) { + size_t left = 0; + size_t right = length; + size_t bestFit = 0; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + if (mid == 0) break; + + // Don't split surrogate pairs - adjust backwards if mid lands after a high surrogate + size_t adjustedMid = mid; + if (adjustedMid > 0 && adjustedMid < length) { + char16_t prev = data[adjustedMid - 1]; + if (prev >= 0xD800 && prev < 0xDC00) { + adjustedMid--; + } + } + + if (adjustedMid == 0) { + right = 0; + break; + } + + size_t midUtf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, adjustedMid)); + if (midUtf8Length <= bufferSize) { + bestFit = adjustedMid; + left = adjustedMid + 1; + } else { + right = adjustedMid - 1; + } + } + + return bestFit; +} + } // namespace TextEncoder::EncodeIntoResult TextEncoder::encodeInto( @@ -762,39 +802,36 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( auto outputBuf = buffer.asArrayPtr(); size_t bufferSize = outputBuf.size(); - // ValueView provides zero-copy access to V8's internal string representation. - // V8 stores strings as either Latin-1 (one byte per character) or UTF-16. v8::String::ValueView view(js.v8Isolate, input); uint32_t length = view.length(); if (view.is_one_byte()) { // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes auto data = reinterpret_cast(view.data8()); - size_t utf8Length = simdutf::utf8_length_from_latin1(data, length); - if (utf8Length <= bufferSize) { - size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; + // Fast path: avoid length calculation when we can prove the string fits. + // Check worst-case (2x), ASCII (1:1), or calculate exact length as fallback. + size_t read = length; + if (!(length * 2 <= bufferSize || + (length <= bufferSize && simdutf::validate_ascii(data, length)) || + simdutf::utf8_length_from_latin1(data, length) <= bufferSize)) { + // Binary search to find how many characters fit + read = findBestFitLatin1(data, length, bufferSize); } - // Buffer too small - find how many characters fit - size_t bestFit = findBestFitLatin1(data, length, bufferSize); - size_t written = simdutf::convert_latin1_to_utf8(data, bestFit, outputBuf.begin()); + size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ - .read = static_cast(bestFit), + .read = static_cast(read), .written = static_cast(written), }; } - // UTF-16 path: check for invalid surrogate pairs first + // UTF-16 path: validate to ensure spec compliance (replace invalid surrogates with U+FFFD) auto data = reinterpret_cast(view.data16()); if (simdutf::validate_utf16(data, length)) { - size_t utf8Length = simdutf::utf8_length_from_utf16(data, length); - if (utf8Length <= bufferSize) { + // Valid UTF-16: use fast SIMD conversion + if (simdutf::utf8_length_from_utf16(data, length) <= bufferSize) { size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -810,21 +847,17 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Invalid UTF-16: normalize unpaired surrogates to U+FFFD before converting - kj::SmallArray tempBuf(length); - simdutf::to_well_formed_utf16(data, length, tempBuf.begin()); - - size_t utf8Length = simdutf::utf8_length_from_utf16(tempBuf.begin(), length); - if (utf8Length <= bufferSize) { - size_t written = simdutf::convert_utf16_to_utf8(tempBuf.begin(), length, outputBuf.begin()); + // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD + if (utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize) { + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), .written = static_cast(written), }; } - size_t bestFit = findBestFitUtf16(tempBuf.begin(), length, bufferSize); - size_t written = simdutf::convert_utf16_to_utf8(tempBuf.begin(), bestFit, outputBuf.begin()); + size_t bestFit = findBestFitInvalidUtf16(data, length, bufferSize); + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, bestFit), outputBuf); return TextEncoder::EncodeIntoResult{ .read = static_cast(bestFit), .written = static_cast(written), From de0de3878e99f60d7e7bb1b403ca4ca243ec493c Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 12 Nov 2025 14:55:02 -0500 Subject: [PATCH 10/29] add fast path that avoids length calculation --- src/workerd/api/encoding.c++ | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index c4848883722..16117342771 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -831,7 +831,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( if (simdutf::validate_utf16(data, length)) { // Valid UTF-16: use fast SIMD conversion - if (simdutf::utf8_length_from_utf16(data, length) <= bufferSize) { + // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) + if (length * 3 <= bufferSize || simdutf::utf8_length_from_utf16(data, length) <= bufferSize) { size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -848,7 +849,9 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( } // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD - if (utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize) { + // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) + if (length * 3 <= bufferSize || + utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize) { size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), From 06b03497a0db1d24949c0ba7fbc8d55371407236 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 12 Nov 2025 15:12:53 -0500 Subject: [PATCH 11/29] make the code reviewable --- src/workerd/api/encoding.c++ | 152 ++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 75 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 16117342771..2e862c982e1 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -500,51 +500,50 @@ constexpr inline bool isTrailSurrogate(char16_t c) { return 0xDC00 <= c && c <= 0xDFFF; } -// Calculate the number of UTF-8 bytes needed for a single UTF-16 code unit -constexpr inline size_t utf8BytesForCodeUnit(char16_t c) { - if (c < 0x80) return 1; - if (c < 0x800) return 2; - return 3; -} - // Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. // Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). +// Uses SIMD for valid portions and falls back to scalar for invalid surrogates. size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { + size_t inputPos = 0; size_t utf8Length = 0; - bool pendingSurrogate = false; - for (size_t i = 0; i < input.size(); i++) { - char16_t c = input[i]; + while (inputPos < input.size()) { + // Find the next invalid surrogate using SIMD validation + auto result = + simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); + + if (result.error == simdutf::error_code::SUCCESS) { + // Remaining input is valid - calculate length with SIMD + utf8Length += + simdutf::utf8_length_from_utf16(input.begin() + inputPos, input.size() - inputPos); + break; + } + + if (result.error == simdutf::error_code::SURROGATE) { + // Calculate length for the valid portion before the error with SIMD + if (result.count > 0) { + utf8Length += simdutf::utf8_length_from_utf16(input.begin() + inputPos, result.count); + inputPos += result.count; + } - if (pendingSurrogate) { - if (isTrailSurrogate(c)) { + // Handle the invalid surrogate at inputPos + char16_t c = input[inputPos]; + if (isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])) { // Valid surrogate pair = 4 bytes in UTF-8 utf8Length += 4; - pendingSurrogate = false; + inputPos += 2; } else { - // Unpaired lead surrogate = U+FFFD (3 bytes) + // Invalid surrogate = U+FFFD (3 bytes) utf8Length += 3; - if (!isLeadSurrogate(c)) { - utf8Length += utf8BytesForCodeUnit(c); - pendingSurrogate = false; - } + inputPos++; } - } else if (isLeadSurrogate(c)) { - pendingSurrogate = true; } else { - if (isTrailSurrogate(c)) { - // Unpaired trail surrogate = U+FFFD (3 bytes) - utf8Length += 3; - } else { - utf8Length += utf8BytesForCodeUnit(c); - } + // Unexpected error - fall back to scalar calculation for safety + break; } } - if (pendingSurrogate) { - utf8Length += 3; // Trailing unpaired lead surrogate - } - return utf8Length; } @@ -577,41 +576,51 @@ inline void encodeSurrogatePair(char16_t lead, char16_t trail, kj::ArrayPtr input, kj::ArrayPtr out) { - size_t position = 0; - bool pendingSurrogate = false; + size_t inputPos = 0; + size_t outputPos = 0; + + while (inputPos < input.size()) { + // Find the next invalid surrogate using SIMD validation + auto result = + simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); + + if (result.error == simdutf::error_code::SUCCESS) { + // Remaining input is valid - convert it all with SIMD + outputPos += simdutf::convert_utf16_to_utf8( + input.begin() + inputPos, input.size() - inputPos, out.begin() + outputPos); + break; + } - for (size_t i = 0; i < input.size(); i++) { - char16_t c = input[i]; + if (result.error == simdutf::error_code::SURROGATE) { + // Convert the valid portion before the error with SIMD + if (result.count > 0) { + outputPos += simdutf::convert_valid_utf16_to_utf8( + input.begin() + inputPos, result.count, out.begin() + outputPos); + inputPos += result.count; + } - if (pendingSurrogate) { - if (isTrailSurrogate(c)) { - encodeSurrogatePair(input[i - 1], c, out.slice(position, out.size())); - position += 4; - pendingSurrogate = false; + // Handle the invalid surrogate at inputPos + char16_t c = input[inputPos]; + if (isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])) { + // Valid surrogate pair - encode it (this shouldn't happen if SURROGATE error) + encodeSurrogatePair(c, input[inputPos + 1], out.slice(outputPos, out.size())); + outputPos += 4; + inputPos += 2; } else { - position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); - if (!isLeadSurrogate(c)) { - position += encodeUtf8CodeUnit(c, out.slice(position, out.size())); - pendingSurrogate = false; - } + // Invalid surrogate - replace with U+FFFD (3 bytes) + outputPos += encodeUtf8CodeUnit(0xFFFD, out.slice(outputPos, out.size())); + inputPos++; } - } else if (isLeadSurrogate(c)) { - pendingSurrogate = true; } else { - if (isTrailSurrogate(c)) { - position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); - } else { - position += encodeUtf8CodeUnit(c, out.slice(position, out.size())); - } + // Unexpected error - fall back to scalar processing for safety + break; } } - if (pendingSurrogate) { - position += encodeUtf8CodeUnit(0xFFFD, out.slice(position, out.size())); - } - - return position; + return outputPos; } } // namespace @@ -832,37 +841,30 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( if (simdutf::validate_utf16(data, length)) { // Valid UTF-16: use fast SIMD conversion // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) - if (length * 3 <= bufferSize || simdutf::utf8_length_from_utf16(data, length) <= bufferSize) { - size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; + size_t read = length; + if (!(length * 3 <= bufferSize || + simdutf::utf8_length_from_utf16(data, length) <= bufferSize)) { + read = findBestFitUtf16(data, length, bufferSize); } - size_t bestFit = findBestFitUtf16(data, length, bufferSize); - size_t written = simdutf::convert_utf16_to_utf8(data, bestFit, outputBuf.begin()); + size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ - .read = static_cast(bestFit), + .read = static_cast(read), .written = static_cast(written), }; } // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) - if (length * 3 <= bufferSize || - utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize) { - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; + size_t read = length; + if (!(length * 3 <= bufferSize || + utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize)) { + read = findBestFitInvalidUtf16(data, length, bufferSize); } - size_t bestFit = findBestFitInvalidUtf16(data, length, bufferSize); - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, bestFit), outputBuf); + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); return TextEncoder::EncodeIntoResult{ - .read = static_cast(bestFit), + .read = static_cast(read), .written = static_cast(written), }; } From 9e892822c036ed8901635ef19829b3e5441bcca2 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 10:19:40 -0500 Subject: [PATCH 12/29] address pr reviews --- src/workerd/api/encoding.c++ | 91 +++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 2e862c982e1..7b1b88cc9f3 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -493,11 +493,11 @@ kj::Maybe TextDecoder::decodePtr( namespace { constexpr inline bool isLeadSurrogate(char16_t c) { - return 0xD800 <= c && c < 0xDC00; + return (c & 0xFC00) == 0xD800; } constexpr inline bool isTrailSurrogate(char16_t c) { - return 0xDC00 <= c && c <= 0xDFFF; + return (c & 0xFC00) == 0xDC00; } // Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. @@ -527,20 +527,18 @@ size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { } // Handle the invalid surrogate at inputPos + // SURROGATE error means unpaired surrogate, so valid pair should be impossible char16_t c = input[inputPos]; - if (isLeadSurrogate(c) && inputPos + 1 < input.size() && - isTrailSurrogate(input[inputPos + 1])) { - // Valid surrogate pair = 4 bytes in UTF-8 - utf8Length += 4; - inputPos += 2; - } else { - // Invalid surrogate = U+FFFD (3 bytes) - utf8Length += 3; - inputPos++; - } + KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])), + "Valid surrogate pair should not trigger SURROGATE error"); + + // Invalid surrogate = U+FFFD (3 bytes) + utf8Length += 3; + inputPos++; } else { - // Unexpected error - fall back to scalar calculation for safety - break; + KJ_FAIL_REQUIRE( + "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); } } @@ -550,13 +548,16 @@ size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { // Encode a single UTF-16 code unit to UTF-8 inline size_t encodeUtf8CodeUnit(char16_t c, kj::ArrayPtr out) { if (c < 0x80) { + KJ_DASSERT(out.size() >= 1); out[0] = static_cast(c); return 1; } else if (c < 0x800) { + KJ_DASSERT(out.size() >= 2); out[0] = static_cast(0xC0 | (c >> 6)); out[1] = static_cast(0x80 | (c & 0x3F)); return 2; } else { + KJ_DASSERT(out.size() >= 3); out[0] = static_cast(0xE0 | (c >> 12)); out[1] = static_cast(0x80 | ((c >> 6) & 0x3F)); out[2] = static_cast(0x80 | (c & 0x3F)); @@ -564,15 +565,6 @@ inline size_t encodeUtf8CodeUnit(char16_t c, kj::ArrayPtr out) { } } -// Encode a valid surrogate pair to UTF-8 -inline void encodeSurrogatePair(char16_t lead, char16_t trail, kj::ArrayPtr out) { - uint32_t codepoint = 0x10000 + (((lead & 0x3FF) << 10) | (trail & 0x3FF)); - out[0] = static_cast(0xF0 | (codepoint >> 18)); - out[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); - out[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); - out[3] = static_cast(0x80 | (codepoint & 0x3F)); -} - // Convert UTF-16 with potentially invalid surrogates to UTF-8. // Invalid surrogates are replaced with U+FFFD. // Returns the number of UTF-8 bytes written. @@ -590,6 +582,7 @@ size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPt // Remaining input is valid - convert it all with SIMD outputPos += simdutf::convert_utf16_to_utf8( input.begin() + inputPos, input.size() - inputPos, out.begin() + outputPos); + KJ_DASSERT(outputPos <= out.size()); break; } @@ -598,25 +591,24 @@ size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPt if (result.count > 0) { outputPos += simdutf::convert_valid_utf16_to_utf8( input.begin() + inputPos, result.count, out.begin() + outputPos); + KJ_DASSERT(outputPos <= out.size()); inputPos += result.count; } // Handle the invalid surrogate at inputPos + // SURROGATE error means unpaired surrogate, so valid pair should be impossible char16_t c = input[inputPos]; - if (isLeadSurrogate(c) && inputPos + 1 < input.size() && - isTrailSurrogate(input[inputPos + 1])) { - // Valid surrogate pair - encode it (this shouldn't happen if SURROGATE error) - encodeSurrogatePair(c, input[inputPos + 1], out.slice(outputPos, out.size())); - outputPos += 4; - inputPos += 2; - } else { - // Invalid surrogate - replace with U+FFFD (3 bytes) - outputPos += encodeUtf8CodeUnit(0xFFFD, out.slice(outputPos, out.size())); - inputPos++; - } + KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])), + "Valid surrogate pair should not trigger SURROGATE error"); + + // Invalid surrogate - replace with U+FFFD (3 bytes) + outputPos += encodeUtf8CodeUnit(0xFFFD, out.slice(outputPos, out.size())); + KJ_DASSERT(outputPos <= out.size()); + inputPos++; } else { - // Unexpected error - fall back to scalar processing for safety - break; + KJ_FAIL_REQUIRE( + "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); } } @@ -745,7 +737,7 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) size_t adjustedMid = mid; if (adjustedMid > 0 && adjustedMid < length) { char16_t prev = data[adjustedMid - 1]; - if (prev >= 0xD800 && prev < 0xDC00) { + if (isLeadSurrogate(prev)) { adjustedMid--; } } @@ -782,7 +774,7 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe size_t adjustedMid = mid; if (adjustedMid > 0 && adjustedMid < length) { char16_t prev = data[adjustedMid - 1]; - if (prev >= 0xD800 && prev < 0xDC00) { + if (isLeadSurrogate(prev)) { adjustedMid--; } } @@ -818,16 +810,27 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes auto data = reinterpret_cast(view.data8()); - // Fast path: avoid length calculation when we can prove the string fits. - // Check worst-case (2x), ASCII (1:1), or calculate exact length as fallback. + // Determine if we need binary search using short-circuit evaluation to minimize checks size_t read = length; - if (!(length * 2 <= bufferSize || - (length <= bufferSize && simdutf::validate_ascii(data, length)) || - simdutf::utf8_length_from_latin1(data, length) <= bufferSize)) { - // Binary search to find how many characters fit + size_t utf8Length = 0; + bool needsBinarySearch = !(length * 2 <= bufferSize || // Fast: worst-case (2x) fits + (length <= bufferSize && simdutf::validate_ascii(data, length)) || // ASCII check + (utf8Length = simdutf::utf8_length_from_latin1(data, length)) <= bufferSize // Exact length + ); + + if (needsBinarySearch) { read = findBestFitLatin1(data, length, bufferSize); } + // ASCII fast path: use memcpy instead of conversion + if (utf8Length == length || (utf8Length == 0 && simdutf::validate_ascii(data, read))) { + memcpy(outputBuf.begin(), data, read); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(read), + }; + } + size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), From 2bfb85a6dffa1739623146c4702fc07e4d9ed7a1 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 10:49:28 -0500 Subject: [PATCH 13/29] more optimizations --- src/workerd/api/encoding.c++ | 370 +++++++++++++++++++++++++---------- 1 file changed, 267 insertions(+), 103 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 7b1b88cc9f3..74fe800b49c 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -496,7 +496,7 @@ constexpr inline bool isLeadSurrogate(char16_t c) { return (c & 0xFC00) == 0xD800; } -constexpr inline bool isTrailSurrogate(char16_t c) { +[[maybe_unused]] constexpr inline bool isTrailSurrogate(char16_t c) { return (c & 0xFC00) == 0xDC00; } @@ -629,20 +629,22 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(backingStore->Data()); + // Use off-heap allocation for intermediate Latin-1 buffer to avoid wasting V8 heap space + // and potentially triggering GC. Stack allocation for small strings, heap for large. + kj::SmallArray latin1Buffer(length); - [[maybe_unused]] auto writeResult = str.writeInto(js, kj::arrayPtr(backingData, length)); + [[maybe_unused]] auto writeResult = str.writeInto(js, latin1Buffer.asPtr()); KJ_DASSERT( writeResult.written == length, "writeInto must completely overwrite the backing buffer"); - utf8_length = - simdutf::utf8_length_from_latin1(reinterpret_cast(backingData), length); + utf8_length = simdutf::utf8_length_from_latin1( + reinterpret_cast(latin1Buffer.begin()), length); if (utf8_length == length) { // ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII + // Allocate final on-heap buffer and copy + backingStore = js.allocBackingStore(length, jsg::Lock::AllocOption::UNINITIALIZED); + memcpy(backingStore->Data(), latin1Buffer.begin(), length); auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, length); return jsg::JsUint8Array(array); } @@ -650,14 +652,13 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional length); // Need to convert Latin-1 to UTF-8 - std::shared_ptr backingStore2 = - js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); [[maybe_unused]] auto written = - simdutf::convert_latin1_to_utf8(reinterpret_cast(backingData), length, - reinterpret_cast(backingStore2->Data())); + simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), length, + reinterpret_cast(backingStore->Data())); KJ_DASSERT(utf8_length == written); auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore2), 0, utf8_length); + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); return jsg::JsUint8Array(array); } @@ -699,101 +700,192 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional bufferSize) { + // This chunk would overflow - binary search within this chunk + size_t left = 0; + size_t right = chunkSize; + size_t bestFit = 0; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + if (mid == 0) break; + + size_t midUtf8Length = simdutf::utf8_length_from_latin1(data + pos, mid); + if (utf8Accumulated + midUtf8Length <= bufferSize) { + bestFit = mid; + left = mid + 1; + } else { + right = mid - 1; + } + } + + return pos + bestFit; } + + utf8Accumulated += chunkUtf8Len; + pos += chunkSize; } - return bestFit; + return pos; } -// Binary search to find how many UTF-16 code units fit when converted to UTF-8. +// Forward scan to find how many UTF-16 code units fit when converted to UTF-8. +// Uses SIMD for fast processing while maintaining O(result) complexity. // Ensures surrogate pairs (0xD800-0xDFFF) are never split across the boundary. size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) { - size_t left = 0; - size_t right = length; - size_t bestFit = 0; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - if (mid == 0) break; - - // Don't split surrogate pairs - adjust backwards if mid lands after a high surrogate - size_t adjustedMid = mid; - if (adjustedMid > 0 && adjustedMid < length) { - char16_t prev = data[adjustedMid - 1]; - if (isLeadSurrogate(prev)) { - adjustedMid--; + size_t pos = 0; + size_t utf8Accumulated = 0; + + // Process in chunks using SIMD for speed + constexpr size_t CHUNK = 256; + + while (pos < length) { + size_t remaining = length - pos; + size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; + + // Adjust chunk to not split surrogate pairs + if (pos + chunkSize < length && chunkSize > 0) { + char16_t last = data[pos + chunkSize - 1]; + if (isLeadSurrogate(last)) { + chunkSize--; } } - if (adjustedMid == 0) { - right = 0; - break; + if (chunkSize == 0) { + // Edge case: chunk would be empty, process at least 2 code units (surrogate pair) + chunkSize = (remaining >= 2) ? 2 : remaining; } - size_t midUtf8Length = simdutf::utf8_length_from_utf16(data, adjustedMid); - if (midUtf8Length <= bufferSize) { - bestFit = adjustedMid; - left = adjustedMid + 1; - } else { - right = adjustedMid - 1; + size_t chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); + + if (utf8Accumulated + chunkUtf8Len > bufferSize) { + // This chunk would overflow - binary search within this chunk + size_t left = 0; + size_t right = chunkSize; + size_t bestFit = 0; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + if (mid == 0) break; + + // Don't split surrogate pairs + size_t adjustedMid = mid; + if (adjustedMid > 0 && pos + adjustedMid < length) { + char16_t prev = data[pos + adjustedMid - 1]; + if (isLeadSurrogate(prev)) { + adjustedMid--; + } + } + + if (adjustedMid == 0) { + right = 0; + break; + } + + size_t midUtf8Length = simdutf::utf8_length_from_utf16(data + pos, adjustedMid); + if (utf8Accumulated + midUtf8Length <= bufferSize) { + bestFit = adjustedMid; + left = adjustedMid + 1; + } else { + right = adjustedMid - 1; + } + } + + return pos + bestFit; } + + utf8Accumulated += chunkUtf8Len; + pos += chunkSize; } - return bestFit; + return pos; } -// Binary search to find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8. +// Forward scan to find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8. +// Uses SIMD for fast processing while maintaining O(result) complexity. // Ensures surrogate pairs are never split, and unpaired surrogates are replaced with U+FFFD. size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t bufferSize) { - size_t left = 0; - size_t right = length; - size_t bestFit = 0; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - if (mid == 0) break; - - // Don't split surrogate pairs - adjust backwards if mid lands after a high surrogate - size_t adjustedMid = mid; - if (adjustedMid > 0 && adjustedMid < length) { - char16_t prev = data[adjustedMid - 1]; - if (isLeadSurrogate(prev)) { - adjustedMid--; + size_t pos = 0; + size_t utf8Accumulated = 0; + + // Process in chunks using SIMD for speed + constexpr size_t CHUNK = 256; + + while (pos < length) { + size_t remaining = length - pos; + size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; + + // Adjust chunk to not split surrogate pairs + if (pos + chunkSize < length && chunkSize > 0) { + char16_t last = data[pos + chunkSize - 1]; + if (isLeadSurrogate(last)) { + chunkSize--; } } - if (adjustedMid == 0) { - right = 0; - break; + if (chunkSize == 0) { + // Edge case: chunk would be empty, process at least 2 code units (surrogate pair) + chunkSize = (remaining >= 2) ? 2 : remaining; } - size_t midUtf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, adjustedMid)); - if (midUtf8Length <= bufferSize) { - bestFit = adjustedMid; - left = adjustedMid + 1; - } else { - right = adjustedMid - 1; + size_t chunkUtf8Len = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, chunkSize)); + + if (utf8Accumulated + chunkUtf8Len > bufferSize) { + // This chunk would overflow - binary search within this chunk + size_t left = 0; + size_t right = chunkSize; + size_t bestFit = 0; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + if (mid == 0) break; + + // Don't split surrogate pairs + size_t adjustedMid = mid; + if (adjustedMid > 0 && pos + adjustedMid < length) { + char16_t prev = data[pos + adjustedMid - 1]; + if (isLeadSurrogate(prev)) { + adjustedMid--; + } + } + + if (adjustedMid == 0) { + right = 0; + break; + } + + size_t midUtf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, adjustedMid)); + if (utf8Accumulated + midUtf8Length <= bufferSize) { + bestFit = adjustedMid; + left = adjustedMid + 1; + } else { + right = adjustedMid - 1; + } + } + + return pos + bestFit; } + + utf8Accumulated += chunkUtf8Len; + pos += chunkSize; } - return bestFit; + return pos; } } // namespace @@ -810,27 +902,49 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes auto data = reinterpret_cast(view.data8()); - // Determine if we need binary search using short-circuit evaluation to minimize checks - size_t read = length; - size_t utf8Length = 0; - bool needsBinarySearch = !(length * 2 <= bufferSize || // Fast: worst-case (2x) fits - (length <= bufferSize && simdutf::validate_ascii(data, length)) || // ASCII check - (utf8Length = simdutf::utf8_length_from_latin1(data, length)) <= bufferSize // Exact length - ); + // Optimize for incremental encoding: if buffer is much smaller than input, + // skip all "whole string fits" checks and go straight to forward scan + if (length > bufferSize * 2) { + // Incremental mode: forward scan to find what fits, then convert + size_t read = findBestFitLatin1(data, length, bufferSize); + size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } - if (needsBinarySearch) { - read = findBestFitLatin1(data, length, bufferSize); + // Buffer might fit most/all of string: try optimized fast paths + // Fast path 1: Worst-case (2x) definitely fits + if (length * 2 <= bufferSize) { + size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; } - // ASCII fast path: use memcpy instead of conversion - if (utf8Length == length || (utf8Length == 0 && simdutf::validate_ascii(data, read))) { - memcpy(outputBuf.begin(), data, read); + // Fast path 2: Check if ASCII (which is 1:1 Latin-1 to UTF-8) + if (length <= bufferSize && simdutf::validate_ascii(data, length)) { + memcpy(outputBuf.begin(), data, length); return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(read), + .read = static_cast(length), + .written = static_cast(length), }; } + // Slow path: Calculate exact UTF-8 length to determine if it fits + size_t utf8Length = simdutf::utf8_length_from_latin1(data, length); + if (utf8Length <= bufferSize) { + size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Doesn't fit: forward scan to find what does + size_t read = findBestFitLatin1(data, length, bufferSize); size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), @@ -843,13 +957,38 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( if (simdutf::validate_utf16(data, length)) { // Valid UTF-16: use fast SIMD conversion - // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) - size_t read = length; - if (!(length * 3 <= bufferSize || - simdutf::utf8_length_from_utf16(data, length) <= bufferSize)) { - read = findBestFitUtf16(data, length, bufferSize); + + // Incremental mode: buffer much smaller than input, skip "whole string fits" checks + if (length > bufferSize) { + size_t read = findBestFitUtf16(data, length, bufferSize); + size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + // Fast path: worst-case (3 bytes per UTF-16 code unit) fits + if (length * 3 <= bufferSize) { + size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Slow path: calculate exact UTF-8 length + size_t utf8Length = simdutf::utf8_length_from_utf16(data, length); + if (utf8Length <= bufferSize) { + size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; } + // Doesn't fit: forward scan to find what does + size_t read = findBestFitUtf16(data, length, bufferSize); size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), @@ -858,13 +997,38 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( } // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD - // Fast path: skip length calculation if worst-case UTF-8 size fits (3 bytes per code unit) - size_t read = length; - if (!(length * 3 <= bufferSize || - utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)) <= bufferSize)) { - read = findBestFitInvalidUtf16(data, length, bufferSize); + + // Incremental mode: buffer much smaller than input, skip "whole string fits" checks + if (length > bufferSize) { + size_t read = findBestFitInvalidUtf16(data, length, bufferSize); + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + // Fast path: worst-case (3 bytes per UTF-16 code unit) fits + if (length * 3 <= bufferSize) { + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Slow path: calculate exact UTF-8 length + size_t utf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + if (utf8Length <= bufferSize) { + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; } + // Doesn't fit: forward scan to find what does + size_t read = findBestFitInvalidUtf16(data, length, bufferSize); size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), From 022e1a2cc8168ceb751b5da99afe1d11cd953211 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 11:06:19 -0500 Subject: [PATCH 14/29] make the code reviewable --- src/workerd/api/encoding.c++ | 53 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 74fe800b49c..353a44c776d 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -700,14 +700,11 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional bufferSize) { - // This chunk would overflow - binary search within this chunk + // Chunk would overflow - binary search within chunk size_t left = 0; size_t right = chunkSize; size_t bestFit = 0; @@ -744,21 +741,19 @@ size_t findBestFitLatin1(const char* data, size_t length, size_t bufferSize) { return pos; } -// Forward scan to find how many UTF-16 code units fit when converted to UTF-8. -// Uses SIMD for fast processing while maintaining O(result) complexity. -// Ensures surrogate pairs (0xD800-0xDFFF) are never split across the boundary. +// Find how many UTF-16 code units fit when converted to UTF-8 +// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs. size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) { size_t pos = 0; size_t utf8Accumulated = 0; - // Process in chunks using SIMD for speed constexpr size_t CHUNK = 256; while (pos < length) { size_t remaining = length - pos; size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; - // Adjust chunk to not split surrogate pairs + // Don't split surrogate pairs at chunk boundary if (pos + chunkSize < length && chunkSize > 0) { char16_t last = data[pos + chunkSize - 1]; if (isLeadSurrogate(last)) { @@ -767,14 +762,13 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) } if (chunkSize == 0) { - // Edge case: chunk would be empty, process at least 2 code units (surrogate pair) chunkSize = (remaining >= 2) ? 2 : remaining; } size_t chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); if (utf8Accumulated + chunkUtf8Len > bufferSize) { - // This chunk would overflow - binary search within this chunk + // Chunk would overflow - binary search within chunk size_t left = 0; size_t right = chunkSize; size_t bestFit = 0; @@ -816,21 +810,19 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) return pos; } -// Forward scan to find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8. -// Uses SIMD for fast processing while maintaining O(result) complexity. -// Ensures surrogate pairs are never split, and unpaired surrogates are replaced with U+FFFD. +// Find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8 +// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs. +// Unpaired surrogates replaced with U+FFFD. size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t bufferSize) { size_t pos = 0; size_t utf8Accumulated = 0; - - // Process in chunks using SIMD for speed constexpr size_t CHUNK = 256; while (pos < length) { size_t remaining = length - pos; size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; - // Adjust chunk to not split surrogate pairs + // Don't split surrogate pairs at chunk boundary if (pos + chunkSize < length && chunkSize > 0) { char16_t last = data[pos + chunkSize - 1]; if (isLeadSurrogate(last)) { @@ -839,14 +831,13 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe } if (chunkSize == 0) { - // Edge case: chunk would be empty, process at least 2 code units (surrogate pair) chunkSize = (remaining >= 2) ? 2 : remaining; } size_t chunkUtf8Len = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, chunkSize)); if (utf8Accumulated + chunkUtf8Len > bufferSize) { - // This chunk would overflow - binary search within this chunk + // Chunk would overflow - binary search within chunk size_t left = 0; size_t right = chunkSize; size_t bestFit = 0; @@ -924,18 +915,18 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Fast path 2: Check if ASCII (which is 1:1 Latin-1 to UTF-8) - if (length <= bufferSize && simdutf::validate_ascii(data, length)) { - memcpy(outputBuf.begin(), data, length); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(length), - }; - } - - // Slow path: Calculate exact UTF-8 length to determine if it fits + // Calculate exact UTF-8 length to determine if it fits size_t utf8Length = simdutf::utf8_length_from_latin1(data, length); if (utf8Length <= bufferSize) { + // Fast path 2: ASCII (utf8Length == length means no conversion needed) + if (utf8Length == length) { + memcpy(outputBuf.begin(), data, length); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(length), + }; + } + // Fits: convert with SIMD size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), From 69829217df7c46c0096d2ee771910e71b67db388 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 11:15:04 -0500 Subject: [PATCH 15/29] use simdutf trim_partial_utf16 --- src/workerd/api/encoding.c++ | 44 ++++++------------------------------ 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 353a44c776d..d6c84fb001a 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -492,7 +492,7 @@ kj::Maybe TextDecoder::decodePtr( namespace { -constexpr inline bool isLeadSurrogate(char16_t c) { +[[maybe_unused]] constexpr inline bool isLeadSurrogate(char16_t c) { return (c & 0xFC00) == 0xD800; } @@ -528,7 +528,7 @@ size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { // Handle the invalid surrogate at inputPos // SURROGATE error means unpaired surrogate, so valid pair should be impossible - char16_t c = input[inputPos]; + [[maybe_unused]] char16_t c = input[inputPos]; KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && isTrailSurrogate(input[inputPos + 1])), "Valid surrogate pair should not trigger SURROGATE error"); @@ -597,7 +597,7 @@ size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPt // Handle the invalid surrogate at inputPos // SURROGATE error means unpaired surrogate, so valid pair should be impossible - char16_t c = input[inputPos]; + [[maybe_unused]] char16_t c = input[inputPos]; KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && isTrailSurrogate(input[inputPos + 1])), "Valid surrogate pair should not trigger SURROGATE error"); @@ -751,15 +751,7 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) while (pos < length) { size_t remaining = length - pos; - size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; - - // Don't split surrogate pairs at chunk boundary - if (pos + chunkSize < length && chunkSize > 0) { - char16_t last = data[pos + chunkSize - 1]; - if (isLeadSurrogate(last)) { - chunkSize--; - } - } + size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK)); if (chunkSize == 0) { chunkSize = (remaining >= 2) ? 2 : remaining; @@ -777,14 +769,7 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) size_t mid = left + (right - left) / 2; if (mid == 0) break; - // Don't split surrogate pairs - size_t adjustedMid = mid; - if (adjustedMid > 0 && pos + adjustedMid < length) { - char16_t prev = data[pos + adjustedMid - 1]; - if (isLeadSurrogate(prev)) { - adjustedMid--; - } - } + size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid); if (adjustedMid == 0) { right = 0; @@ -820,15 +805,7 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe while (pos < length) { size_t remaining = length - pos; - size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; - - // Don't split surrogate pairs at chunk boundary - if (pos + chunkSize < length && chunkSize > 0) { - char16_t last = data[pos + chunkSize - 1]; - if (isLeadSurrogate(last)) { - chunkSize--; - } - } + size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK)); if (chunkSize == 0) { chunkSize = (remaining >= 2) ? 2 : remaining; @@ -846,14 +823,7 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe size_t mid = left + (right - left) / 2; if (mid == 0) break; - // Don't split surrogate pairs - size_t adjustedMid = mid; - if (adjustedMid > 0 && pos + adjustedMid < length) { - char16_t prev = data[pos + adjustedMid - 1]; - if (isLeadSurrogate(prev)) { - adjustedMid--; - } - } + size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid); if (adjustedMid == 0) { right = 0; From 525cbacc858774128b6631c3cea714fd18af9180 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 11:46:58 -0500 Subject: [PATCH 16/29] avoid repetitive simdutf_length calls --- src/workerd/api/encoding.c++ | 70 +++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index d6c84fb001a..a5e6053bd19 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -702,7 +702,10 @@ namespace { // Find how many Latin-1 characters fit when converted to UTF-8 // Uses chunked forward scan with SIMD, O(result) complexity -size_t findBestFitLatin1(const char* data, size_t length, size_t bufferSize) { +// Template parameter ReturnLength controls whether to return just position or (position, utf8_length) +template +std::conditional_t, size_t> findBestFitLatin1( + const char* data, size_t length, size_t bufferSize) { size_t pos = 0; size_t utf8Accumulated = 0; constexpr size_t CHUNK = 256; @@ -731,22 +734,35 @@ size_t findBestFitLatin1(const char* data, size_t length, size_t bufferSize) { } } - return pos + bestFit; + if constexpr (ReturnLength) { + size_t finalPos = pos + bestFit; + size_t finalUtf8Len = + utf8Accumulated + simdutf::utf8_length_from_latin1(data + pos, bestFit); + return {finalPos, finalUtf8Len}; + } else { + return pos + bestFit; + } } utf8Accumulated += chunkUtf8Len; pos += chunkSize; } - return pos; + if constexpr (ReturnLength) { + return {pos, utf8Accumulated}; + } else { + return pos; + } } // Find how many UTF-16 code units fit when converted to UTF-8 // Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs. -size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) { +// Template parameter ReturnLength controls whether to return just position or (position, utf8_length) +template +std::conditional_t, size_t> findBestFitUtf16( + const char16_t* data, size_t length, size_t bufferSize) { size_t pos = 0; size_t utf8Accumulated = 0; - constexpr size_t CHUNK = 256; while (pos < length) { @@ -785,14 +801,25 @@ size_t findBestFitUtf16(const char16_t* data, size_t length, size_t bufferSize) } } - return pos + bestFit; + if constexpr (ReturnLength) { + size_t finalPos = pos + bestFit; + size_t finalUtf8Len = + utf8Accumulated + simdutf::utf8_length_from_utf16(data + pos, bestFit); + return {finalPos, finalUtf8Len}; + } else { + return pos + bestFit; + } } utf8Accumulated += chunkUtf8Len; pos += chunkSize; } - return pos; + if constexpr (ReturnLength) { + return {pos, utf8Accumulated}; + } else { + return pos; + } } // Find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8 @@ -875,8 +902,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Buffer might fit most/all of string: try optimized fast paths - // Fast path 1: Worst-case (2x) definitely fits + // Fast path: Worst-case (2x) definitely fits if (length * 2 <= bufferSize) { size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ @@ -885,10 +911,12 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Calculate exact UTF-8 length to determine if it fits - size_t utf8Length = simdutf::utf8_length_from_latin1(data, length); - if (utf8Length <= bufferSize) { - // Fast path 2: ASCII (utf8Length == length means no conversion needed) + // Use forward scan that also returns UTF-8 length (avoids redundant full-string scan) + auto [read, utf8Length] = findBestFitLatin1(data, length, bufferSize); + + // Check if everything fit + if (read == length) { + // ASCII fast path: utf8Length == length means no conversion needed if (utf8Length == length) { memcpy(outputBuf.begin(), data, length); return TextEncoder::EncodeIntoResult{ @@ -896,7 +924,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( .written = static_cast(length), }; } - // Fits: convert with SIMD + // All fit: convert with SIMD size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -904,8 +932,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Doesn't fit: forward scan to find what does - size_t read = findBestFitLatin1(data, length, bufferSize); + // Partial fit: convert only what fits size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), @@ -938,9 +965,11 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Slow path: calculate exact UTF-8 length - size_t utf8Length = simdutf::utf8_length_from_utf16(data, length); - if (utf8Length <= bufferSize) { + // Use forward scan that also returns UTF-8 length (avoids redundant full-string scan) + auto [read, utf8Length] = findBestFitUtf16(data, length, bufferSize); + + if (read == length) { + // Everything fit: convert all size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -948,8 +977,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Doesn't fit: forward scan to find what does - size_t read = findBestFitUtf16(data, length, bufferSize); + // Partial fit: convert only what fits size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), From 538ed749a0849d43dd183bf7ec160b6032f346d1 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 15:33:04 -0500 Subject: [PATCH 17/29] get rid of string flattening --- src/workerd/api/encoding.c++ | 70 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index a5e6053bd19..de0fecb144b 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -625,10 +625,10 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional backingStore; size_t utf8_length = 0; + auto length = str.length(js); // Fast path: check if string is one-byte before creating ValueView if (str.isOneByte(js)) { - auto length = str.length(js); // Use off-heap allocation for intermediate Latin-1 buffer to avoid wasting V8 heap space // and potentially triggering GC. Stack allocation for small strings, heap for large. kj::SmallArray latin1Buffer(length); @@ -663,36 +663,28 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path. - auto data = reinterpret_cast(view.data16()); - - if (simdutf::validate_utf16le(data, view.length())) { - // Common case: valid UTF-16, convert directly to UTF-8 - utf8_length = simdutf::utf8_length_from_utf16le(data, view.length()); - backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8( - data, view.length(), reinterpret_cast(backingStore->Data())); - KJ_DASSERT(written == utf8_length); - } else { - // Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard, - // unpaired surrogates must be replaced with U+FFFD (replacement character). - // Use custom conversion that handles invalid surrogates without creating an - // intermediate well-formed UTF-16 buffer. - auto inputArray = kj::ArrayPtr(data, view.length()); - utf8_length = utf8LengthFromInvalidUtf16(inputArray); - backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - auto outputArray = - kj::ArrayPtr(reinterpret_cast(backingStore->Data()), utf8_length); - convertInvalidUtf16ToUtf8(inputArray, outputArray); - } - } // ValueView destroyed here, releasing the heap lock + // Use off-heap allocation for intermediate UTF-16 buffer to avoid triggering GC. + // Stack allocation for small strings, heap for large. + kj::SmallArray utf16Buffer(length); + + // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through + // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat (written by Erik in 2008). + // This means we may read from multiple string segments, but that's fine for our use case. + [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); + + auto data = reinterpret_cast(utf16Buffer.begin()); + utf8_length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + + if (!simdutf::validate_utf16(data, length)) { + simdutf::to_well_formed_utf16(data, length, data); + } + + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = simdutf::convert_valid_utf16_to_utf8( + data, length, reinterpret_cast(backingStore->Data())); - // Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects. - // Create the Uint8Array from the raw v8::BackingStore. auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); return jsg::JsUint8Array(array); @@ -911,12 +903,15 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Use forward scan that also returns UTF-8 length (avoids redundant full-string scan) + // "Maybe fits" zone: bufferSize < length*2, but might still fit entirely. + // Use forward scan with ReturnLength=true to get both position and UTF-8 length. + // This avoids redundant work: if we called utf8_length_from_latin1() to check if it fits, + // then called findBestFitLatin1() when it doesn't, we'd scan the string twice. auto [read, utf8Length] = findBestFitLatin1(data, length, bufferSize); // Check if everything fit if (read == length) { - // ASCII fast path: utf8Length == length means no conversion needed + // ASCII fast path: utf8Length == length means all chars are ASCII, no conversion needed if (utf8Length == length) { memcpy(outputBuf.begin(), data, length); return TextEncoder::EncodeIntoResult{ @@ -924,8 +919,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( .written = static_cast(length), }; } - // All fit: convert with SIMD - size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + + auto written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), .written = static_cast(written), @@ -965,11 +960,14 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Use forward scan that also returns UTF-8 length (avoids redundant full-string scan) + // "Maybe fits" zone: bufferSize < length*3, but might still fit entirely. + // Use forward scan with ReturnLength=true to get both position and UTF-8 length. + // This avoids redundant work: if we called utf8_length_from_utf16() to check if it fits, + // then called findBestFitUtf16() when it doesn't, we'd scan the string twice. auto [read, utf8Length] = findBestFitUtf16(data, length, bufferSize); if (read == length) { - // Everything fit: convert all + // Everything fit: convert entire string with SIMD size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), From 575373e8a74e77c7f6f0468877fbb86a4850c053 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 15:46:33 -0500 Subject: [PATCH 18/29] add more comments --- src/workerd/api/encoding.c++ | 76 +++++++++++++++++++++++++++++------- src/workerd/jsg/jsg.h | 8 ++++ src/workerd/jsg/jsvalue.h | 6 +++ 3 files changed, 76 insertions(+), 14 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index de0fecb144b..4ab9eb1a121 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -882,10 +882,37 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes auto data = reinterpret_cast(view.data8()); - // Optimize for incremental encoding: if buffer is much smaller than input, - // skip all "whole string fits" checks and go straight to forward scan + // Latin-1 encoding strategy: three zones based on input size vs buffer capacity + // + // For Latin-1: ASCII chars (0x00-0x7F) → 1 byte, extended chars (0x80-0xFF) → 2 bytes + // Worst-case expansion: 2x, Best-case: 1x (pure ASCII), Typical mixed: ~1.2-1.5x + // + // Zone 1: "Definitely doesn't fit" (length > bufferSize * 2) + // Even if all ASCII (best case 1:1), string won't fit. Go straight to incremental mode. + // Uses forward scan without length calculation for maximum efficiency. + // Example: 1M chars, 400k buffer → can't possibly fit, scan to find cutoff point + // + // Zone 2: "Definitely fits" (length * 2 <= bufferSize) + // Even if all extended Latin-1 (worst case 1:2), string will fit. Convert directly. + // Example: 100k chars, 250k buffer → worst case 200k bytes, guaranteed to fit + // + // Zone 3: "Maybe fits" (bufferSize < length * 2 AND length <= bufferSize * 2) + // Might fit depending on ASCII/extended ratio. Use forward scan with length calculation. + // Avoids redundant work: scanning once gets us both position and UTF-8 length. + // Example: 600k chars, 700k buffer → fits if mostly ASCII, doesn't if mixed + // + // Threshold selection (bufferSize * 2): + // - Chosen based on worst-case Latin-1 expansion of 2x + // - Optimized for common case: small buffer relative to input (SSR, streaming) + // - Trade-off: Zone 3 still does forward scan, but with length calculation overhead + // - Performance cliff exists for borderline cases (e.g., 1M chars, 500k buffer falls + // into Zone 3), but forward scan with length is still reasonably efficient + // + // Future optimization: Could use sampling to estimate ASCII ratio and choose zone + // dynamically, but adds complexity for marginal benefit in typical workloads. + if (length > bufferSize * 2) { - // Incremental mode: forward scan to find what fits, then convert + // Zone 1: Incremental mode - forward scan to find what fits, then convert size_t read = findBestFitLatin1(data, length, bufferSize); size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ @@ -894,8 +921,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Fast path: Worst-case (2x) definitely fits if (length * 2 <= bufferSize) { + // Zone 2: Fast path - worst-case (2x) definitely fits, convert directly size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -903,10 +930,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // "Maybe fits" zone: bufferSize < length*2, but might still fit entirely. - // Use forward scan with ReturnLength=true to get both position and UTF-8 length. - // This avoids redundant work: if we called utf8_length_from_latin1() to check if it fits, - // then called findBestFitLatin1() when it doesn't, we'd scan the string twice. + // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan auto [read, utf8Length] = findBestFitLatin1(data, length, bufferSize); // Check if everything fit @@ -940,9 +964,36 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( if (simdutf::validate_utf16(data, length)) { // Valid UTF-16: use fast SIMD conversion + // + // UTF-16 to UTF-8 encoding: variable expansion based on code point ranges + // U+0000-U+007F (ASCII): 1 byte (rare in two-byte strings) + // U+0080-U+07FF: 2 bytes (most common) + // U+0800-U+FFFF (BMP): 3 bytes (common: CJK, etc.) + // U+10000-U+10FFFF (surrogate pairs): 4 bytes (less common: emoji, etc.) + // Worst-case: 3 bytes per code unit (BMP chars), Typical: ~2-3 bytes per code unit + // + // Zone 1: "Definitely doesn't fit" (length > bufferSize) + // Conservative threshold: even if all ASCII (impossible for two-byte strings), won't fit. + // This differs from Latin-1 (bufferSize * 2) due to different typical expansion patterns. + // Example: 1M code units, 900k buffer → can't fit, use incremental mode + // + // Zone 2: "Definitely fits" (length * 3 <= bufferSize) + // Even if all BMP characters (worst case 1:3), string will fit. Convert directly. + // Example: 200k code units, 700k buffer → worst case 600k bytes, guaranteed to fit + // + // Zone 3: "Maybe fits" (bufferSize < length * 3 AND length <= bufferSize) + // Might fit depending on character distribution. Use forward scan with length calculation. + // Example: 300k code units, 800k buffer → fits if mostly 2-byte chars, doesn't if BMP + // + // Threshold selection (bufferSize vs bufferSize * 3): + // - Zone 1 threshold (length > bufferSize) is conservative: even 1:1 ratio won't fit + // - More aggressive than Latin-1 because UTF-16 typical expansion is higher (~2-3x) + // - Zone 3 (maybe fits) is large: from bufferSize to bufferSize * 3 + // - Optimized for common case where UTF-16 strings are mostly 2-3 byte encodings + // - Performance cliff: Zone 3 still uses forward scan with length calculation overhead - // Incremental mode: buffer much smaller than input, skip "whole string fits" checks if (length > bufferSize) { + // Zone 1: Incremental mode - forward scan to find what fits, then convert size_t read = findBestFitUtf16(data, length, bufferSize); size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ @@ -951,8 +1002,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // Fast path: worst-case (3 bytes per UTF-16 code unit) fits if (length * 3 <= bufferSize) { + // Zone 2: Fast path - worst-case (3x) definitely fits, convert directly size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), @@ -960,10 +1011,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( }; } - // "Maybe fits" zone: bufferSize < length*3, but might still fit entirely. - // Use forward scan with ReturnLength=true to get both position and UTF-8 length. - // This avoids redundant work: if we called utf8_length_from_utf16() to check if it fits, - // then called findBestFitUtf16() when it doesn't, we'd scan the string twice. + // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan auto [read, utf8Length] = findBestFitUtf16(data, length, bufferSize); if (read == length) { diff --git a/src/workerd/jsg/jsg.h b/src/workerd/jsg/jsg.h index cab79368322..0f41ea9ea63 100644 --- a/src/workerd/jsg/jsg.h +++ b/src/workerd/jsg/jsg.h @@ -2757,6 +2757,14 @@ class Lock { // Utility method to safely allocate a v8::BackingStore with allocation failure handling. // Throws a javascript error if allocation fails. + // + // IMPORTANT: This method can trigger garbage collection, which may move or invalidate V8 + // objects. Do NOT call this method while: + // - A v8::String::ValueView is alive (it holds internal V8 heap locks) + // - You have raw pointers to V8 heap data (e.g., from view.data8(), view.data16()) + // + // Safe pattern: Copy V8 string data to off-heap memory FIRST (e.g., via JsString::writeInto() + // into kj::SmallArray), THEN call allocBackingStore(). See TextEncoder::encode() for example. std::unique_ptr allocBackingStore( size_t size, AllocOption init_mode = AllocOption::ZERO_INITIALIZED) KJ_WARN_UNUSED_RESULT; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index 2c9c1f55fde..8a52a5d5bd7 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -305,6 +305,12 @@ class JsString final: public JsBase { // The number of elements (e.g. char, byte, uint16_t) written to the buffer. size_t written; }; + + // Copy string contents into a provided buffer (off-heap memory). + // + // IMPORTANT: This method does NOT flatten the V8 string or hold V8 heap locks. It safely + // copies data out of V8's heap into your buffer. This makes it safe to use before calling + // GC-triggering operations like Lock::allocBackingStore(). WriteIntoStatus writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options = WriteFlags::NONE) const; WriteIntoStatus writeInto( From 621e3cee2390e2e8e2b5102088dc63ab01fdd0fd Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 14 Nov 2025 15:51:08 -0500 Subject: [PATCH 19/29] simplify things --- src/workerd/api/encoding.c++ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 4ab9eb1a121..28439ca98af 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -704,7 +704,7 @@ std::conditional_t, size_t> findBestFitL while (pos < length) { size_t remaining = length - pos; - size_t chunkSize = remaining < CHUNK ? remaining : CHUNK; + size_t chunkSize = kj::min(remaining, CHUNK); size_t chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); if (utf8Accumulated + chunkUtf8Len > bufferSize) { From 4558093508a78d37e23d4f1b0589da69af73b4d3 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 17 Nov 2025 11:43:44 -0500 Subject: [PATCH 20/29] address pr reviews --- src/workerd/api/encoding.c++ | 62 +++++++++++++++--------------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 28439ca98af..a4f9ea0c49b 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -492,13 +492,15 @@ kj::Maybe TextDecoder::decodePtr( namespace { -[[maybe_unused]] constexpr inline bool isLeadSurrogate(char16_t c) { +#ifdef KJ_DEBUG +constexpr inline bool isLeadSurrogate(char16_t c) { return (c & 0xFC00) == 0xD800; } -[[maybe_unused]] constexpr inline bool isTrailSurrogate(char16_t c) { +constexpr inline bool isTrailSurrogate(char16_t c) { return (c & 0xFC00) == 0xDC00; } +#endif // KJ_DEBUG // Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. // Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). @@ -545,26 +547,6 @@ size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { return utf8Length; } -// Encode a single UTF-16 code unit to UTF-8 -inline size_t encodeUtf8CodeUnit(char16_t c, kj::ArrayPtr out) { - if (c < 0x80) { - KJ_DASSERT(out.size() >= 1); - out[0] = static_cast(c); - return 1; - } else if (c < 0x800) { - KJ_DASSERT(out.size() >= 2); - out[0] = static_cast(0xC0 | (c >> 6)); - out[1] = static_cast(0x80 | (c & 0x3F)); - return 2; - } else { - KJ_DASSERT(out.size() >= 3); - out[0] = static_cast(0xE0 | (c >> 12)); - out[1] = static_cast(0x80 | ((c >> 6) & 0x3F)); - out[2] = static_cast(0x80 | (c & 0x3F)); - return 3; - } -} - // Convert UTF-16 with potentially invalid surrogates to UTF-8. // Invalid surrogates are replaced with U+FFFD. // Returns the number of UTF-8 bytes written. @@ -597,14 +579,15 @@ size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPt // Handle the invalid surrogate at inputPos // SURROGATE error means unpaired surrogate, so valid pair should be impossible - [[maybe_unused]] char16_t c = input[inputPos]; - KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && + KJ_DASSERT(!(isLeadSurrogate(input[inputPos]) && inputPos + 1 < input.size() && isTrailSurrogate(input[inputPos + 1])), "Valid surrogate pair should not trigger SURROGATE error"); - // Invalid surrogate - replace with U+FFFD (3 bytes) - outputPos += encodeUtf8CodeUnit(0xFFFD, out.slice(outputPos, out.size())); - KJ_DASSERT(outputPos <= out.size()); + // Invalid surrogate - replace with U+FFFD (3 bytes: 0xEF 0xBF 0xBD) + KJ_DASSERT(outputPos + 3 <= out.size()); + out[outputPos++] = static_cast(0xEF); + out[outputPos++] = static_cast(0xBF); + out[outputPos++] = static_cast(0xBD); inputPos++; } else { KJ_FAIL_REQUIRE( @@ -623,7 +606,12 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { jsg::JsString str = input.orDefault(js.str()); - std::shared_ptr backingStore; + +#ifdef KJ_DEBUG + bool wasAlreadyFlat = str.isFlat(); + KJ_DEFER({ KJ_ASSERT(wasAlreadyFlat || !str.isFlat()); }); +#endif + size_t utf8_length = 0; auto length = str.length(js); @@ -643,22 +631,23 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::OptionalData(), latin1Buffer.begin(), length); - auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, length); + auto array = + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, length); return jsg::JsUint8Array(array); } KJ_DASSERT(utf8_length > length); // Need to convert Latin-1 to UTF-8 - backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); [[maybe_unused]] auto written = simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), length, reinterpret_cast(backingStore->Data())); KJ_DASSERT(utf8_length == written); - auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); + auto array = v8::Uint8Array::New( + v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); return jsg::JsUint8Array(array); } @@ -681,12 +670,12 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(backingStore->Data())); auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); return jsg::JsUint8Array(array); } @@ -937,7 +926,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( if (read == length) { // ASCII fast path: utf8Length == length means all chars are ASCII, no conversion needed if (utf8Length == length) { - memcpy(outputBuf.begin(), data, length); + KJ_DASSERT(length <= bufferSize); + outputBuf.slice(0, length).copyFrom(kj::arrayPtr(data, length)); return TextEncoder::EncodeIntoResult{ .read = static_cast(length), .written = static_cast(length), From 95fe6424da0b892a5d024cf1d5fda72f27e0a196 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 18 Nov 2025 13:57:56 -0500 Subject: [PATCH 21/29] simplify implementation --- src/workerd/api/encoding.c++ | 56 ++++-------------------------------- 1 file changed, 6 insertions(+), 50 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index a4f9ea0c49b..01297bf4f41 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -502,51 +502,6 @@ constexpr inline bool isTrailSurrogate(char16_t c) { } #endif // KJ_DEBUG -// Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. -// Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). -// Uses SIMD for valid portions and falls back to scalar for invalid surrogates. -size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { - size_t inputPos = 0; - size_t utf8Length = 0; - - while (inputPos < input.size()) { - // Find the next invalid surrogate using SIMD validation - auto result = - simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); - - if (result.error == simdutf::error_code::SUCCESS) { - // Remaining input is valid - calculate length with SIMD - utf8Length += - simdutf::utf8_length_from_utf16(input.begin() + inputPos, input.size() - inputPos); - break; - } - - if (result.error == simdutf::error_code::SURROGATE) { - // Calculate length for the valid portion before the error with SIMD - if (result.count > 0) { - utf8Length += simdutf::utf8_length_from_utf16(input.begin() + inputPos, result.count); - inputPos += result.count; - } - - // Handle the invalid surrogate at inputPos - // SURROGATE error means unpaired surrogate, so valid pair should be impossible - [[maybe_unused]] char16_t c = input[inputPos]; - KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && - isTrailSurrogate(input[inputPos + 1])), - "Valid surrogate pair should not trigger SURROGATE error"); - - // Invalid surrogate = U+FFFD (3 bytes) - utf8Length += 3; - inputPos++; - } else { - KJ_FAIL_REQUIRE( - "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); - } - } - - return utf8Length; -} - // Convert UTF-16 with potentially invalid surrogates to UTF-8. // Invalid surrogates are replaced with U+FFFD. // Returns the number of UTF-8 bytes written. @@ -657,14 +612,14 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional utf16Buffer(length); // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through - // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat (written by Erik in 2008). + // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat. // This means we may read from multiple string segments, but that's fine for our use case. [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); KJ_DASSERT( writeResult.written == length, "writeInto must completely overwrite the backing buffer"); auto data = reinterpret_cast(utf16Buffer.begin()); - utf8_length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + utf8_length = simdutf::utf8_length_from_utf16_with_replacement(data, length); if (!simdutf::validate_utf16(data, length)) { simdutf::to_well_formed_utf16(data, length, data); @@ -819,7 +774,7 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe chunkSize = (remaining >= 2) ? 2 : remaining; } - size_t chunkUtf8Len = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, chunkSize)); + size_t chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize); if (utf8Accumulated + chunkUtf8Len > bufferSize) { // Chunk would overflow - binary search within chunk @@ -838,7 +793,8 @@ size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t buffe break; } - size_t midUtf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, adjustedMid)); + size_t midUtf8Length = + simdutf::utf8_length_from_utf16_with_replacement(data + pos, adjustedMid); if (utf8Accumulated + midUtf8Length <= bufferSize) { bestFit = adjustedMid; left = adjustedMid + 1; @@ -1043,7 +999,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( } // Slow path: calculate exact UTF-8 length - size_t utf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + size_t utf8Length = simdutf::utf8_length_from_utf16_with_replacement(data, length); if (utf8Length <= bufferSize) { size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); return TextEncoder::EncodeIntoResult{ From 411a0559c763a5f97867397f1f97ce893f6a1c21 Mon Sep 17 00:00:00 2001 From: Erik Corry Date: Fri, 21 Nov 2025 20:08:00 +0100 Subject: [PATCH 22/29] An attempt to simplify the encodeInto change. (#5565) * Simplify * Tune slightly * Fix assert * Fix perf regression and OOM read * Handle very tiny output buffers * feedback --- src/workerd/api/encoding.c++ | 572 ++++++++--------------------------- 1 file changed, 128 insertions(+), 444 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 01297bf4f41..17af01cbff7 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -490,71 +490,6 @@ kj::Maybe TextDecoder::decodePtr( // ======================================================================================= // TextEncoder implementation -namespace { - -#ifdef KJ_DEBUG -constexpr inline bool isLeadSurrogate(char16_t c) { - return (c & 0xFC00) == 0xD800; -} - -constexpr inline bool isTrailSurrogate(char16_t c) { - return (c & 0xFC00) == 0xDC00; -} -#endif // KJ_DEBUG - -// Convert UTF-16 with potentially invalid surrogates to UTF-8. -// Invalid surrogates are replaced with U+FFFD. -// Returns the number of UTF-8 bytes written. -// Uses SIMD for valid portions and falls back to scalar for invalid surrogates. -size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPtr out) { - size_t inputPos = 0; - size_t outputPos = 0; - - while (inputPos < input.size()) { - // Find the next invalid surrogate using SIMD validation - auto result = - simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); - - if (result.error == simdutf::error_code::SUCCESS) { - // Remaining input is valid - convert it all with SIMD - outputPos += simdutf::convert_utf16_to_utf8( - input.begin() + inputPos, input.size() - inputPos, out.begin() + outputPos); - KJ_DASSERT(outputPos <= out.size()); - break; - } - - if (result.error == simdutf::error_code::SURROGATE) { - // Convert the valid portion before the error with SIMD - if (result.count > 0) { - outputPos += simdutf::convert_valid_utf16_to_utf8( - input.begin() + inputPos, result.count, out.begin() + outputPos); - KJ_DASSERT(outputPos <= out.size()); - inputPos += result.count; - } - - // Handle the invalid surrogate at inputPos - // SURROGATE error means unpaired surrogate, so valid pair should be impossible - KJ_DASSERT(!(isLeadSurrogate(input[inputPos]) && inputPos + 1 < input.size() && - isTrailSurrogate(input[inputPos + 1])), - "Valid surrogate pair should not trigger SURROGATE error"); - - // Invalid surrogate - replace with U+FFFD (3 bytes: 0xEF 0xBF 0xBD) - KJ_DASSERT(outputPos + 3 <= out.size()); - out[outputPos++] = static_cast(0xEF); - out[outputPos++] = static_cast(0xBF); - out[outputPos++] = static_cast(0xBD); - inputPos++; - } else { - KJ_FAIL_REQUIRE( - "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); - } - } - - return outputPos; -} - -} // namespace - jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } @@ -562,15 +497,13 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { jsg::JsString str = input.orDefault(js.str()); -#ifdef KJ_DEBUG - bool wasAlreadyFlat = str.isFlat(); - KJ_DEFER({ KJ_ASSERT(wasAlreadyFlat || !str.isFlat()); }); -#endif - size_t utf8_length = 0; auto length = str.length(js); - // Fast path: check if string is one-byte before creating ValueView + // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through + // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat. + // This means we may read from multiple string segments, but that's fine for our use case. + if (str.isOneByte(js)) { // Use off-heap allocation for intermediate Latin-1 buffer to avoid wasting V8 heap space // and potentially triggering GC. Stack allocation for small strings, heap for large. @@ -583,37 +516,26 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(latin1Buffer.begin()), length); + auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); if (utf8_length == length) { // ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII - // Allocate final on-heap buffer and copy - auto backingStore = js.allocBackingStore(length, jsg::Lock::AllocOption::UNINITIALIZED); memcpy(backingStore->Data(), latin1Buffer.begin(), length); - auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, length); - return jsg::JsUint8Array(array); + } else { + [[maybe_unused]] auto written = + simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), + length, reinterpret_cast(backingStore->Data())); + KJ_DASSERT(utf8_length == written); } - - KJ_DASSERT(utf8_length > length); - - // Need to convert Latin-1 to UTF-8 - auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - [[maybe_unused]] auto written = - simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), length, - reinterpret_cast(backingStore->Data())); - KJ_DASSERT(utf8_length == written); auto array = v8::Uint8Array::New( v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); return jsg::JsUint8Array(array); } - // Two-byte string path - // Use off-heap allocation for intermediate UTF-16 buffer to avoid triggering GC. + // Use off-heap allocation for intermediate UTF-16 buffer to avoid wasting V8 heap space + // and potentially triggering GC. Stack allocation for small strings, heap for large. // Stack allocation for small strings, heap for large. kj::SmallArray utf16Buffer(length); - // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through - // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat. - // This means we may read from multiple string segments, but that's fine for our use case. [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); KJ_DASSERT( writeResult.written == length, "writeInto must completely overwrite the backing buffer"); @@ -621,14 +543,20 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(utf16Buffer.begin()); utf8_length = simdutf::utf8_length_from_utf16_with_replacement(data, length); - if (!simdutf::validate_utf16(data, length)) { - simdutf::to_well_formed_utf16(data, length, data); - } - auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - [[maybe_unused]] auto written = simdutf::convert_valid_utf16_to_utf8( + auto result = simdutf::convert_utf16_to_utf8_with_errors( data, length, reinterpret_cast(backingStore->Data())); + if (result.error != simdutf::SUCCESS) { + // Oh, no, there are unpaired surrogates. This is hopefully rare. + simdutf::to_well_formed_utf16(data, length, data); + [[maybe_unused]] auto written = + simdutf::convert_utf16_to_utf8(data, length, reinterpret_cast(backingStore->Data())); + KJ_DASSERT(written == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); + } else { + KJ_DASSERT(result.count == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); + } + auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); return jsg::JsUint8Array(array); @@ -636,180 +564,93 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional -std::conditional_t, size_t> findBestFitLatin1( - const char* data, size_t length, size_t bufferSize) { - size_t pos = 0; - size_t utf8Accumulated = 0; - constexpr size_t CHUNK = 256; - - while (pos < length) { - size_t remaining = length - pos; - size_t chunkSize = kj::min(remaining, CHUNK); - size_t chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); - - if (utf8Accumulated + chunkUtf8Len > bufferSize) { - // Chunk would overflow - binary search within chunk - size_t left = 0; - size_t right = chunkSize; - size_t bestFit = 0; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - if (mid == 0) break; - - size_t midUtf8Length = simdutf::utf8_length_from_latin1(data + pos, mid); - if (utf8Accumulated + midUtf8Length <= bufferSize) { - bestFit = mid; - left = mid + 1; - } else { - right = mid - 1; - } - } - - if constexpr (ReturnLength) { - size_t finalPos = pos + bestFit; - size_t finalUtf8Len = - utf8Accumulated + simdutf::utf8_length_from_latin1(data + pos, bestFit); - return {finalPos, finalUtf8Len}; - } else { - return pos + bestFit; - } - } - - utf8Accumulated += chunkUtf8Len; - pos += chunkSize; - } +constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) { + // We would like to use simdutf::trim_partial_utf16, but it's not guaranteed + // to work right on invalid UTF-16. + return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00; +} - if constexpr (ReturnLength) { - return {pos, utf8Accumulated}; - } else { - return pos; - } +// Ignores surrogates conservatively. +constexpr size_t simpleUtfEncodingLength(uint16_t c) { + if (c < 0x80) return 1; + if (c < 0x400) return 2; + return 3; } -// Find how many UTF-16 code units fit when converted to UTF-8 -// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs. -// Template parameter ReturnLength controls whether to return just position or (position, utf8_length) -template -std::conditional_t, size_t> findBestFitUtf16( - const char16_t* data, size_t length, size_t bufferSize) { +// Find how many UTF-16 or Latin1 code units fit when converted to UTF-8. +// May conservatively underestimate the largest number of code units we can fit +// because of undetected surrogate pairs on boundaries. +// Works even on malformed UTF-16. +template +size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { size_t pos = 0; size_t utf8Accumulated = 0; - constexpr size_t CHUNK = 256; - - while (pos < length) { - size_t remaining = length - pos; - size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK)); - - if (chunkSize == 0) { - chunkSize = (remaining >= 2) ? 2 : remaining; + // The SIMD is more efficient with a size that's a little over a multiple of 16. + constexpr size_t CHUNK = 257; + // The max number of UTF-8 output bytes per input code unit. + constexpr bool UTF16 = sizeof(Char) == 2; + constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2; + + // Our initial guess at how much the number of elements expands in the + // conversion to UTF-8. + double expansion = 1.15; + + while (pos < length && utf8Accumulated < bufferSize) { + size_t remainingInput = length - pos; + size_t spaceRemaining = bufferSize - utf8Accumulated; + KJ_DASSERT(expansion >= 1.15); + + // We estimate how many characters are likely to fit in the buffer, but + // only try for CHUNK characters at a time to minimize the worst case + // waste of time if we guessed too high. + size_t guaranteedToFit = spaceRemaining / MAX_FACTOR; + if (guaranteedToFit >= remainingInput) { + // Don't even bother checking any more, it's all going to fit. Hitting + // this halfway through is also a good reason to limit the CHUNK size. + return length; + } + size_t likelyToFit = kj::min(static_cast(spaceRemaining / expansion), CHUNK); + size_t fitEstimate = kj::max(1, kj::max(guaranteedToFit, likelyToFit)); + size_t chunkSize = kj::min(remainingInput, fitEstimate); + if (chunkSize == 1) break; // Not worth running this complicated stuff one char at a time. + // No div-by-zero because remainingInput and fitEstimate are at least 1. + KJ_DASSERT(chunkSize >= 1); + + size_t chunkUtf8Len; + if constexpr (UTF16) { + chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize); + } else { + chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); } - - size_t chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize); if (utf8Accumulated + chunkUtf8Len > bufferSize) { - // Chunk would overflow - binary search within chunk - size_t left = 0; - size_t right = chunkSize; - size_t bestFit = 0; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - if (mid == 0) break; - - size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid); - - if (adjustedMid == 0) { - right = 0; - break; - } - - size_t midUtf8Length = simdutf::utf8_length_from_utf16(data + pos, adjustedMid); - if (utf8Accumulated + midUtf8Length <= bufferSize) { - bestFit = adjustedMid; - left = adjustedMid + 1; - } else { - right = adjustedMid - 1; - } - } - - if constexpr (ReturnLength) { - size_t finalPos = pos + bestFit; - size_t finalUtf8Len = - utf8Accumulated + simdutf::utf8_length_from_utf16(data + pos, bestFit); - return {finalPos, finalUtf8Len}; - } else { - return pos + bestFit; - } + // Our chosen chunk didn't fit in the rest of the output buffer. + KJ_DASSERT(chunkSize > guaranteedToFit); + // Since it didn't fit we adjust our expansion guess upwards. + expansion = kj::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize); + } else { + // Use successful length calculation to adjust our expansion estimate. + expansion = kj::max(1.15, (chunkUtf8Len * 1.1) / chunkSize); + pos += chunkSize; + utf8Accumulated += chunkUtf8Len; } - - utf8Accumulated += chunkUtf8Len; - pos += chunkSize; } - - if constexpr (ReturnLength) { - return {pos, utf8Accumulated}; - } else { - return pos; + // Do the last few code units in a simpler way. + while (pos < length && utf8Accumulated < bufferSize) { + size_t extra = simpleUtfEncodingLength(data[pos]); + if (utf8Accumulated + extra > bufferSize) break; + pos++; + utf8Accumulated += extra; } -} - -// Find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8 -// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs. -// Unpaired surrogates replaced with U+FFFD. -size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t bufferSize) { - size_t pos = 0; - size_t utf8Accumulated = 0; - constexpr size_t CHUNK = 256; - - while (pos < length) { - size_t remaining = length - pos; - size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK)); - - if (chunkSize == 0) { - chunkSize = (remaining >= 2) ? 2 : remaining; - } - - size_t chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize); - - if (utf8Accumulated + chunkUtf8Len > bufferSize) { - // Chunk would overflow - binary search within chunk - size_t left = 0; - size_t right = chunkSize; - size_t bestFit = 0; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - if (mid == 0) break; - - size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid); - - if (adjustedMid == 0) { - right = 0; - break; - } - - size_t midUtf8Length = - simdutf::utf8_length_from_utf16_with_replacement(data + pos, adjustedMid); - if (utf8Accumulated + midUtf8Length <= bufferSize) { - bestFit = adjustedMid; - left = adjustedMid + 1; - } else { - right = adjustedMid - 1; - } - } - - return pos + bestFit; + if (UTF16 && pos != 0 && pos != length && isSurrogatePair(data[pos - 1], data[pos])) { + // We ended on a leading surrogate which has a matching trailing surrogate in the next + // position. In order to make progress when the bufferSize is tiny we try to include it. + if (utf8Accumulated < bufferSize) { + pos++; // We had one more byte, so we can include the pair, UTF-8 encoding 3->4. + } else { + pos--; // Don't chop the pair in half. } - - utf8Accumulated += chunkUtf8Len; - pos += chunkSize; } - return pos; } @@ -820,197 +661,40 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( auto outputBuf = buffer.asArrayPtr(); size_t bufferSize = outputBuf.size(); - v8::String::ValueView view(js.v8Isolate, input); - uint32_t length = view.length(); - - if (view.is_one_byte()) { - // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes - auto data = reinterpret_cast(view.data8()); - - // Latin-1 encoding strategy: three zones based on input size vs buffer capacity - // - // For Latin-1: ASCII chars (0x00-0x7F) → 1 byte, extended chars (0x80-0xFF) → 2 bytes - // Worst-case expansion: 2x, Best-case: 1x (pure ASCII), Typical mixed: ~1.2-1.5x - // - // Zone 1: "Definitely doesn't fit" (length > bufferSize * 2) - // Even if all ASCII (best case 1:1), string won't fit. Go straight to incremental mode. - // Uses forward scan without length calculation for maximum efficiency. - // Example: 1M chars, 400k buffer → can't possibly fit, scan to find cutoff point - // - // Zone 2: "Definitely fits" (length * 2 <= bufferSize) - // Even if all extended Latin-1 (worst case 1:2), string will fit. Convert directly. - // Example: 100k chars, 250k buffer → worst case 200k bytes, guaranteed to fit - // - // Zone 3: "Maybe fits" (bufferSize < length * 2 AND length <= bufferSize * 2) - // Might fit depending on ASCII/extended ratio. Use forward scan with length calculation. - // Avoids redundant work: scanning once gets us both position and UTF-8 length. - // Example: 600k chars, 700k buffer → fits if mostly ASCII, doesn't if mixed - // - // Threshold selection (bufferSize * 2): - // - Chosen based on worst-case Latin-1 expansion of 2x - // - Optimized for common case: small buffer relative to input (SSR, streaming) - // - Trade-off: Zone 3 still does forward scan, but with length calculation overhead - // - Performance cliff exists for borderline cases (e.g., 1M chars, 500k buffer falls - // into Zone 3), but forward scan with length is still reasonably efficient - // - // Future optimization: Could use sampling to estimate ASCII ratio and choose zone - // dynamically, but adds complexity for marginal benefit in typical workloads. - - if (length > bufferSize * 2) { - // Zone 1: Incremental mode - forward scan to find what fits, then convert - size_t read = findBestFitLatin1(data, length, bufferSize); - size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), - }; - } - - if (length * 2 <= bufferSize) { - // Zone 2: Fast path - worst-case (2x) definitely fits, convert directly - size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; - } - - // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan - auto [read, utf8Length] = findBestFitLatin1(data, length, bufferSize); - - // Check if everything fit - if (read == length) { - // ASCII fast path: utf8Length == length means all chars are ASCII, no conversion needed - if (utf8Length == length) { - KJ_DASSERT(length <= bufferSize); - outputBuf.slice(0, length).copyFrom(kj::arrayPtr(data, length)); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(length), - }; + size_t read = 0; + size_t written = 0; + { + // Scope for the view - we can't do anything that might cause a V8 GC! + v8::String::ValueView view(js.v8Isolate, input); + uint32_t length = view.length(); + + if (view.is_one_byte()) { + auto data = reinterpret_cast(view.data8()); + read = findBestFit(data, length, bufferSize); + if (read != 0) { + KJ_DASSERT(simdutf::utf8_length_from_latin1(data, read) <= bufferSize); + written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); + } + } else { + auto data = reinterpret_cast(view.data16()); + read = findBestFit(data, length, bufferSize); + if (read != 0) { + KJ_DASSERT(simdutf::utf8_length_from_utf16_with_replacement(data, read) <= bufferSize); + simdutf::result result = + simdutf::convert_utf16_to_utf8_with_errors(data, read, outputBuf.begin()); + if (result.error == simdutf::SUCCESS) { + written = result.count; + } else { + // Oh, no, there are unpaired surrogates. This is hopefully rare. + kj::SmallArray conversionBuffer(read); + simdutf::to_well_formed_utf16(data, read, conversionBuffer.begin()); + written = + simdutf::convert_utf16_to_utf8(conversionBuffer.begin(), read, outputBuf.begin()); + } } - - auto written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; - } - - // Partial fit: convert only what fits - size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), - }; - } - - // UTF-16 path: validate to ensure spec compliance (replace invalid surrogates with U+FFFD) - auto data = reinterpret_cast(view.data16()); - - if (simdutf::validate_utf16(data, length)) { - // Valid UTF-16: use fast SIMD conversion - // - // UTF-16 to UTF-8 encoding: variable expansion based on code point ranges - // U+0000-U+007F (ASCII): 1 byte (rare in two-byte strings) - // U+0080-U+07FF: 2 bytes (most common) - // U+0800-U+FFFF (BMP): 3 bytes (common: CJK, etc.) - // U+10000-U+10FFFF (surrogate pairs): 4 bytes (less common: emoji, etc.) - // Worst-case: 3 bytes per code unit (BMP chars), Typical: ~2-3 bytes per code unit - // - // Zone 1: "Definitely doesn't fit" (length > bufferSize) - // Conservative threshold: even if all ASCII (impossible for two-byte strings), won't fit. - // This differs from Latin-1 (bufferSize * 2) due to different typical expansion patterns. - // Example: 1M code units, 900k buffer → can't fit, use incremental mode - // - // Zone 2: "Definitely fits" (length * 3 <= bufferSize) - // Even if all BMP characters (worst case 1:3), string will fit. Convert directly. - // Example: 200k code units, 700k buffer → worst case 600k bytes, guaranteed to fit - // - // Zone 3: "Maybe fits" (bufferSize < length * 3 AND length <= bufferSize) - // Might fit depending on character distribution. Use forward scan with length calculation. - // Example: 300k code units, 800k buffer → fits if mostly 2-byte chars, doesn't if BMP - // - // Threshold selection (bufferSize vs bufferSize * 3): - // - Zone 1 threshold (length > bufferSize) is conservative: even 1:1 ratio won't fit - // - More aggressive than Latin-1 because UTF-16 typical expansion is higher (~2-3x) - // - Zone 3 (maybe fits) is large: from bufferSize to bufferSize * 3 - // - Optimized for common case where UTF-16 strings are mostly 2-3 byte encodings - // - Performance cliff: Zone 3 still uses forward scan with length calculation overhead - - if (length > bufferSize) { - // Zone 1: Incremental mode - forward scan to find what fits, then convert - size_t read = findBestFitUtf16(data, length, bufferSize); - size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), - }; - } - - if (length * 3 <= bufferSize) { - // Zone 2: Fast path - worst-case (3x) definitely fits, convert directly - size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; - } - - // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan - auto [read, utf8Length] = findBestFitUtf16(data, length, bufferSize); - - if (read == length) { - // Everything fit: convert entire string with SIMD - size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; } - - // Partial fit: convert only what fits - size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), - }; } - - // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD - - // Incremental mode: buffer much smaller than input, skip "whole string fits" checks - if (length > bufferSize) { - size_t read = findBestFitInvalidUtf16(data, length, bufferSize); - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), - }; - } - - // Fast path: worst-case (3 bytes per UTF-16 code unit) fits - if (length * 3 <= bufferSize) { - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; - } - - // Slow path: calculate exact UTF-8 length - size_t utf8Length = simdutf::utf8_length_from_utf16_with_replacement(data, length); - if (utf8Length <= bufferSize) { - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(length), - .written = static_cast(written), - }; - } - - // Doesn't fit: forward scan to find what does - size_t read = findBestFitInvalidUtf16(data, length, bufferSize); - size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); + KJ_DASSERT(written <= bufferSize); return TextEncoder::EncodeIntoResult{ .read = static_cast(read), .written = static_cast(written), From 62fb056042cb7b51d92f8e585ebfd149ae395686 Mon Sep 17 00:00:00 2001 From: Erik Corry Date: Mon, 24 Nov 2025 14:44:18 +0100 Subject: [PATCH 23/29] Add some tests of encodeinto for short output buffers. (#5570) --- src/workerd/api/BUILD.bazel | 7 +++ src/workerd/api/encoding-test.c++ | 77 +++++++++++++++++++++++++++++++ src/workerd/api/encoding.c++ | 23 +++++++-- src/workerd/api/encoding.h | 7 +++ 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 src/workerd/api/encoding-test.c++ diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index 0254d0994b9..4a1086ac3c1 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -588,6 +588,13 @@ kj_test( ], ) +kj_test( + src = "encoding-test.c++", + deps = [ + ":encoding", + ], +) + kj_test( src = "base64-test.c++", deps = ["//src/workerd/tests:test-fixture"], diff --git a/src/workerd/api/encoding-test.c++ b/src/workerd/api/encoding-test.c++ new file mode 100644 index 00000000000..a45d78aa563 --- /dev/null +++ b/src/workerd/api/encoding-test.c++ @@ -0,0 +1,77 @@ +// Copyright (c) 2025 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +#include "encoding.h" + +#include + +namespace workerd::api { +namespace test { + +KJ_TEST("BestFitASCII") { + // If there's zero input or output space, the answer is zero. + KJ_ASSERT(bestFit("", 0) == 0); + KJ_ASSERT(bestFit("a", 0) == 0); + KJ_ASSERT(bestFit("aa", 0) == 0); + KJ_ASSERT(bestFit("aaa", 0) == 0); + KJ_ASSERT(bestFit("aaaa", 0) == 0); + KJ_ASSERT(bestFit("aaaaa", 0) == 0); + KJ_ASSERT(bestFit("", 0) == 0); + KJ_ASSERT(bestFit("", 1) == 0); + KJ_ASSERT(bestFit("", 2) == 0); + KJ_ASSERT(bestFit("", 3) == 0); + KJ_ASSERT(bestFit("", 4) == 0); + KJ_ASSERT(bestFit("", 5) == 0); + // Zero cases with two-byte strings. + KJ_ASSERT(bestFit(u"", 0) == 0); + KJ_ASSERT(bestFit(u"€", 0) == 0); + KJ_ASSERT(bestFit(u"€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€€€", 0) == 0); + KJ_ASSERT(bestFit(u"", 0) == 0); + KJ_ASSERT(bestFit(u"", 1) == 0); + KJ_ASSERT(bestFit(u"", 2) == 0); + KJ_ASSERT(bestFit(u"", 3) == 0); + KJ_ASSERT(bestFit(u"", 4) == 0); + KJ_ASSERT(bestFit(u"", 5) == 0); + // Small buffers that only just fit. + KJ_ASSERT(bestFit(u"a", 1) == 1); + KJ_ASSERT(bestFit(u"å", 2) == 1); + KJ_ASSERT(bestFit(u"€", 3) == 1); + KJ_ASSERT(bestFit(u"😹", 4) == 2); + // Small buffers that don't fit. + KJ_ASSERT(bestFit(u"å", 1) == 0); + KJ_ASSERT(bestFit(u"€", 2) == 0); + KJ_ASSERT(bestFit(u"😹", 3) == 0); + // Don't chop a surrogate pair. + KJ_ASSERT(bestFit(u"1😹", 4) == 1); + KJ_ASSERT(bestFit(u"12😹", 5) == 2); + KJ_ASSERT(bestFit(u"123😹", 6) == 3); + KJ_ASSERT(bestFit(u"1234😹", 7) == 4); + KJ_ASSERT(bestFit(u"12345😹", 8) == 5); + // Some bigger ones just for fun. + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 0) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 1) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 2) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 3) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 4) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 5) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 6) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 7) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 8) == 4); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 9) == 4); + KJ_ASSERT(bestFit(u"0😹😹😹😹😹😹", 9) == 5); // 0😹😹 is 5 and takes 9. + KJ_ASSERT(bestFit(u"01😹😹😹😹😹😹", 9) == 4); // 01😹 is 4 and takes 6. + KJ_ASSERT(bestFit(u"012😹😹😹😹😹😹", 9) == 5); // 012😹 is 5 and takes 7. + KJ_ASSERT(bestFit(u"0123😹😹😹😹😹😹", 9) == 6); // 0123😹 is 6 and takes 8. + KJ_ASSERT(bestFit(u"01234😹😹😹😹😹😹", 9) == 7); // 01234😹 is 7 and takes 9. + KJ_ASSERT(bestFit(u"012345😹😹😹😹😹😹", 9) == 6); // 012345 is 6 and takes 6. + KJ_ASSERT(bestFit(u"0123456😹😹😹😹😹😹", 9) == 7); // 0123456 is 7 and takes 7. + KJ_ASSERT(bestFit(u"01234567😹😹😹😹😹😹", 9) == 8); // 0123456 is 8 and takes 8. + KJ_ASSERT(bestFit(u"012345678😹😹😹😹😹😹", 9) == 9); // 0123456 is 9 and takes 9. +} + +} // namespace test +} // namespace workerd::api diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 17af01cbff7..8b0748ea63b 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -277,6 +277,9 @@ Encoding getEncodingForLabel(kj::StringPtr label) { #undef V return Encoding::INVALID; } + +constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096; + } // namespace const kj::Array TextDecoder::EMPTY = @@ -507,7 +510,7 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional latin1Buffer(length); + kj::SmallArray latin1Buffer(length); [[maybe_unused]] auto writeResult = str.writeInto(js, latin1Buffer.asPtr()); KJ_DASSERT( @@ -534,7 +537,7 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional utf16Buffer(length); + kj::SmallArray utf16Buffer(length); [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); KJ_DASSERT( @@ -656,6 +659,20 @@ size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { } // namespace +namespace test { + +size_t bestFit(const char* str, size_t bufferSize) { + return findBestFit(str, strlen(str), bufferSize); +} + +size_t bestFit(const char16_t* str, size_t bufferSize) { + size_t length = 0; + while (str[length] != 0) length++; + return findBestFit(str, length, bufferSize); +} + +} // namespace test + TextEncoder::EncodeIntoResult TextEncoder::encodeInto( jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer) { auto outputBuf = buffer.asArrayPtr(); @@ -686,7 +703,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( written = result.count; } else { // Oh, no, there are unpaired surrogates. This is hopefully rare. - kj::SmallArray conversionBuffer(read); + kj::SmallArray conversionBuffer(read); simdutf::to_well_formed_utf16(data, read, conversionBuffer.begin()); written = simdutf::convert_utf16_to_utf8(conversionBuffer.begin(), read, outputBuf.begin()); diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index e694ad1b355..1325be8f245 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -245,4 +245,11 @@ class TextEncoder final: public jsg::Object { #define EW_ENCODING_ISOLATE_TYPES \ api::TextDecoder, api::TextEncoder, api::TextDecoder::ConstructorOptions, \ api::TextDecoder::DecodeOptions, api::TextEncoder::EncodeIntoResult + +namespace test { + +size_t bestFit(const char* str, size_t bufferSize); +size_t bestFit(const char16_t* str, size_t bufferSize); + +} // namespace test } // namespace workerd::api From 6956cb57f8453812417a0675120af0d9895dd4fc Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 24 Nov 2025 12:12:12 -0500 Subject: [PATCH 24/29] put changes behind an autogate --- src/workerd/api/encoding.c++ | 20 ++++++++++++++++++++ src/workerd/util/autogate.c++ | 2 ++ src/workerd/util/autogate.h | 2 ++ 3 files changed, 24 insertions(+) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 8b0748ea63b..d2e601d4929 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -498,6 +499,16 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { } jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { + if (!workerd::util::Autogate::isEnabled(workerd::util::AutogateKey::ENABLE_FAST_TEXTENCODER)) { + auto str = input.orDefault(js.str()); + auto view = JSG_REQUIRE_NONNULL(jsg::BufferSource::tryAlloc(js, str.utf8Length(js)), RangeError, + "Cannot allocate space for TextEncoder.encode"); + auto result = str.writeInto( + js, view.asArrayPtr().asChars(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + KJ_DASSERT(result.written == view.size()); + return jsg::JsUint8Array(view.getHandle(js).As()); + } + jsg::JsString str = input.orDefault(js.str()); size_t utf8_length = 0; @@ -675,6 +686,15 @@ size_t bestFit(const char16_t* str, size_t bufferSize) { TextEncoder::EncodeIntoResult TextEncoder::encodeInto( jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer) { + if (!workerd::util::Autogate::isEnabled(workerd::util::AutogateKey::ENABLE_FAST_TEXTENCODER)) { + auto result = input.writeInto( + js, buffer.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(result.read), + .written = static_cast(result.written), + }; + } + auto outputBuf = buffer.asArrayPtr(); size_t bufferSize = outputBuf.size(); diff --git a/src/workerd/util/autogate.c++ b/src/workerd/util/autogate.c++ index 5a8b5a8d9dd..49c2869cdc8 100644 --- a/src/workerd/util/autogate.c++ +++ b/src/workerd/util/autogate.c++ @@ -35,6 +35,8 @@ kj::StringPtr KJ_STRINGIFY(AutogateKey key) { return "compression-stream-use-state-machine"_kj; case AutogateKey::IDENTITY_TRANSFORM_STREAM_USE_STATE_MACHINE: return "identity-transform-stream-use-state-machine"_kj; + case AutogateKey::ENABLE_FAST_TEXTENCODER: + return "enable-fast-textencoder"_kj; case AutogateKey::NumOfKeys: KJ_FAIL_ASSERT("NumOfKeys should not be used in getName"); } diff --git a/src/workerd/util/autogate.h b/src/workerd/util/autogate.h index 82a4af828e8..f41e074bd5d 100644 --- a/src/workerd/util/autogate.h +++ b/src/workerd/util/autogate.h @@ -30,6 +30,8 @@ enum class AutogateKey { COMPRESSION_STREAM_USE_STATE_MACHINE, // Switch the IdentityTransformStream to use the new state machine-based impl IDENTITY_TRANSFORM_STREAM_USE_STATE_MACHINE, + // Enable fast TextEncoder implementation using simdutf + ENABLE_FAST_TEXTENCODER, NumOfKeys // Reserved for iteration. }; From 166e9fd511c6822826f007e5012e9575d123d00d Mon Sep 17 00:00:00 2001 From: Erik Corry Date: Mon, 24 Nov 2025 21:55:19 +0100 Subject: [PATCH 25/29] Attempt to eliminate last regression (#5579) --- src/workerd/api/encoding.c++ | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index d2e601d4929..ef151cc1b95 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -703,14 +703,26 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( { // Scope for the view - we can't do anything that might cause a V8 GC! v8::String::ValueView view(js.v8Isolate, input); - uint32_t length = view.length(); + size_t length = view.length(); if (view.is_one_byte()) { auto data = reinterpret_cast(view.data8()); - read = findBestFit(data, length, bufferSize); - if (read != 0) { - KJ_DASSERT(simdutf::utf8_length_from_latin1(data, read) <= bufferSize); - written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); + simdutf::result result = + simdutf::validate_ascii_with_errors(data, kj::min(length, bufferSize)); + written = read = result.count; + auto outAddr = outputBuf.begin(); + memcpy(outAddr, data, read); + outAddr += read; + data += read; + length -= read; + bufferSize -= read; + if (length != 0 && bufferSize != 0) { + size_t rest = findBestFit(data, length, bufferSize); + if (rest != 0) { + KJ_DASSERT(simdutf::utf8_length_from_latin1(data, rest) <= bufferSize); + written += simdutf::convert_latin1_to_utf8(data, rest, outAddr); + read += rest; + } } } else { auto data = reinterpret_cast(view.data16()); @@ -732,9 +744,14 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( } } KJ_DASSERT(written <= bufferSize); + // V8's String::kMaxLenth is a lot less than a maximal int so this is fine. + using RInt = decltype(TextEncoder::EncodeIntoResult::read); + using WInt = decltype(TextEncoder::EncodeIntoResult::written); + KJ_DASSERT(0 <= read && read <= std::numeric_limits::max()); + KJ_DASSERT(0 <= written && written <= std::numeric_limits::max()); return TextEncoder::EncodeIntoResult{ - .read = static_cast(read), - .written = static_cast(written), + .read = static_cast(read), + .written = static_cast(written), }; } From d462ca1042beee6c83ae43d8ceeced658bc10621 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Mon, 29 Dec 2025 18:31:36 -0500 Subject: [PATCH 26/29] make changes due to simdutf --- src/workerd/api/BUILD.bazel | 3 ++- src/workerd/api/encoding.c++ | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index 4a1086ac3c1..58838c42bff 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -434,7 +434,6 @@ wd_cc_library( srcs = ["encoding.c++"], hdrs = ["encoding.h"], implementation_deps = [ - "//src/workerd/io:features", "//src/workerd/util:strings", "@simdutf", ], @@ -442,6 +441,7 @@ wd_cc_library( deps = [ ":util", "//src/workerd/io:compatibility-date_capnp", + "//src/workerd/io:features", "//src/workerd/jsg", "@capnp-cpp//src/kj", "@simdutf", @@ -592,6 +592,7 @@ kj_test( src = "encoding-test.c++", deps = [ ":encoding", + "//src/workerd/io", ], ) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index ef151cc1b95..be487069e04 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -555,7 +555,7 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(utf16Buffer.begin()); - utf8_length = simdutf::utf8_length_from_utf16_with_replacement(data, length); + utf8_length = simdutf::utf8_length_from_utf16_with_replacement(data, length).count; auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); auto result = simdutf::convert_utf16_to_utf8_with_errors( @@ -632,7 +632,7 @@ size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { size_t chunkUtf8Len; if constexpr (UTF16) { - chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize); + chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize).count; } else { chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); } @@ -728,7 +728,8 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( auto data = reinterpret_cast(view.data16()); read = findBestFit(data, length, bufferSize); if (read != 0) { - KJ_DASSERT(simdutf::utf8_length_from_utf16_with_replacement(data, read) <= bufferSize); + KJ_DASSERT( + simdutf::utf8_length_from_utf16_with_replacement(data, read).count <= bufferSize); simdutf::result result = simdutf::convert_utf16_to_utf8_with_errors(data, read, outputBuf.begin()); if (result.error == simdutf::SUCCESS) { From e4e393da386f1f649440fac8b25dc5d19112c1cd Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 30 Dec 2025 10:59:07 -0500 Subject: [PATCH 27/29] leverage simdutf more (#5797) --- src/workerd/api/encoding.c++ | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index be487069e04..d4e73048b6f 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -555,22 +555,21 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(utf16Buffer.begin()); - utf8_length = simdutf::utf8_length_from_utf16_with_replacement(data, length).count; + auto lengthResult = simdutf::utf8_length_from_utf16_with_replacement(data, length); + utf8_length = lengthResult.count; - auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); - auto result = simdutf::convert_utf16_to_utf8_with_errors( - data, length, reinterpret_cast(backingStore->Data())); - - if (result.error != simdutf::SUCCESS) { - // Oh, no, there are unpaired surrogates. This is hopefully rare. + if (lengthResult.error == simdutf::SURROGATE) { + // If there are surrogates there may be unpaired surrogates. Fix them. simdutf::to_well_formed_utf16(data, length, data); - [[maybe_unused]] auto written = - simdutf::convert_utf16_to_utf8(data, length, reinterpret_cast(backingStore->Data())); - KJ_DASSERT(written == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); } else { - KJ_DASSERT(result.count == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); + KJ_DASSERT(lengthResult.error == simdutf::SUCCESS); } + auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = + simdutf::convert_utf16_to_utf8(data, length, reinterpret_cast(backingStore->Data())); + KJ_DASSERT(written == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); + auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); return jsg::JsUint8Array(array); From a03390fdb4a2655285f83c7affe9eee6253f4323 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 30 Dec 2025 11:15:10 -0500 Subject: [PATCH 28/29] fix build warning --- src/workerd/api/encoding.c++ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index d4e73048b6f..12ae03bcb1b 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -503,7 +503,7 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional()); From 1a2eab7467c5443cd8973e863d0662d714751a0a Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 30 Dec 2025 12:20:45 -0500 Subject: [PATCH 29/29] address pr reviews --- src/workerd/api/encoding-test.c++ | 13 +++++++++++++ src/workerd/api/encoding.c++ | 25 +++++++++++++------------ src/workerd/api/streams/encoding.c++ | 4 +--- src/workerd/jsg/jsvalue.h | 6 ++++++ 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/workerd/api/encoding-test.c++ b/src/workerd/api/encoding-test.c++ index a45d78aa563..d74374092b4 100644 --- a/src/workerd/api/encoding-test.c++ +++ b/src/workerd/api/encoding-test.c++ @@ -9,6 +9,19 @@ namespace workerd::api { namespace test { +// These tests verify the findBestFit() function used by TextEncoder.encodeInto(). +// +// bestFit(input, bufferSize) returns the number of input code units that can be +// fully converted to UTF-8 and fit within the given output buffer size in bytes. +// +// The key insight is that different characters expand to different UTF-8 byte lengths: +// - ASCII (U+0000-U+007F): 1 byte per code unit +// - Latin-1 extended (U+0080-U+00FF): 2 bytes per code unit +// - BMP characters (U+0100-U+FFFF): 2-3 bytes per code unit +// - Supplementary characters (U+10000+): 4 bytes, encoded as surrogate pairs in UTF-16 +// +// The function must never split a surrogate pair, so if there's only room for part of +// a multi-byte character, it stops before that character. KJ_TEST("BestFitASCII") { // If there's zero input or output space, the answer is zero. KJ_ASSERT(bestFit("", 0) == 0); diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 12ae03bcb1b..d2cb4a8a949 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -514,6 +514,11 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional WriteV2 -> WriteHelperV2 -> String::WriteToFlat. // This means we may read from multiple string segments, but that's fine for our use case. @@ -533,16 +538,14 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::OptionalData(), latin1Buffer.begin(), length); + kj::arrayPtr(static_cast(backingStore->Data()), length).copyFrom(latin1Buffer); } else { [[maybe_unused]] auto written = simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), length, reinterpret_cast(backingStore->Data())); KJ_DASSERT(utf8_length == written); } - auto array = v8::Uint8Array::New( - v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); - return jsg::JsUint8Array(array); + return jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, utf8_length); } // Use off-heap allocation for intermediate UTF-16 buffer to avoid wasting V8 heap space @@ -570,24 +573,21 @@ jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional(backingStore->Data())); KJ_DASSERT(written == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); - auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, utf8_length); - return jsg::JsUint8Array(array); + return jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, utf8_length); } namespace { constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) { // We would like to use simdutf::trim_partial_utf16, but it's not guaranteed - // to work right on invalid UTF-16. + // to work right on invalid UTF-16. Hence, we need this method to check for + // surrogate pairs and correctly trim utf16 chunks. return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00; } // Ignores surrogates conservatively. constexpr size_t simpleUtfEncodingLength(uint16_t c) { - if (c < 0x80) return 1; - if (c < 0x400) return 2; - return 3; + return 1 + (c >= 0x80) + (c >= 0x400); } // Find how many UTF-16 or Latin1 code units fit when converted to UTF-8. @@ -669,6 +669,7 @@ size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { } // namespace +// Test helpers used by encoding-test.c++ to verify findBestFit behavior. namespace test { size_t bestFit(const char* str, size_t bufferSize) { @@ -710,7 +711,7 @@ TextEncoder::EncodeIntoResult TextEncoder::encodeInto( simdutf::validate_ascii_with_errors(data, kj::min(length, bufferSize)); written = read = result.count; auto outAddr = outputBuf.begin(); - memcpy(outAddr, data, read); + kj::arrayPtr(outAddr, read).copyFrom(kj::arrayPtr(data, read)); outAddr += read; data += read; length -= read; diff --git a/src/workerd/api/streams/encoding.c++ b/src/workerd/api/streams/encoding.c++ index 7fe67ce5e68..ede16250b83 100644 --- a/src/workerd/api/streams/encoding.c++ +++ b/src/workerd/api/streams/encoding.c++ @@ -93,9 +93,7 @@ jsg::Ref TextEncoderStream::constructor(jsg::Lock& js) { if (holder->pending != kj::none) { auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED); memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3); - auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3); - controller->enqueue(js, jsg::JsUint8Array(array)); + controller->enqueue(js, jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, 3)); } return js.resolvedPromise(); }; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index 8a52a5d5bd7..25ee99fe228 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -254,6 +254,12 @@ class JsArrayBufferView final: public JsBase { public: + static JsUint8Array create( + Lock& js, std::unique_ptr backingStore, size_t byteOffset, size_t length) { + return JsUint8Array(v8::Uint8Array::New( + v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), byteOffset, length)); + } + template kj::ArrayPtr asArrayPtr() { v8::Local inner = *this;