Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cc39cf3
experiment with value view and simdutf
anonrig Oct 31, 2025
d93fcf7
address pr reviews
anonrig Oct 31, 2025
28b102d
address pr reviews
anonrig Oct 31, 2025
d6691fe
get rid of multiple valueviews
anonrig Nov 3, 2025
572ffa3
apply optimization to improve invalid utf16
anonrig Nov 3, 2025
6ce652b
add missing simdutf dependency
anonrig Nov 3, 2025
d980b42
apply review recommendations
anonrig Nov 4, 2025
c439565
optimize encodeInto
anonrig Nov 12, 2025
abef75c
optimize ASCII paths
anonrig Nov 12, 2025
de0de38
add fast path that avoids length calculation
anonrig Nov 12, 2025
06b0349
make the code reviewable
anonrig Nov 12, 2025
9e89282
address pr reviews
anonrig Nov 14, 2025
2bfb85a
more optimizations
anonrig Nov 14, 2025
022e1a2
make the code reviewable
anonrig Nov 14, 2025
6982921
use simdutf trim_partial_utf16
anonrig Nov 14, 2025
525cbac
avoid repetitive simdutf_length calls
anonrig Nov 14, 2025
538ed74
get rid of string flattening
anonrig Nov 14, 2025
575373e
add more comments
anonrig Nov 14, 2025
621e3ce
simplify things
anonrig Nov 14, 2025
4558093
address pr reviews
anonrig Nov 17, 2025
95fe642
simplify implementation
anonrig Nov 18, 2025
411a055
An attempt to simplify the encodeInto change. (#5565)
erikcorry Nov 21, 2025
62fb056
Add some tests of encodeinto for short output buffers. (#5570)
erikcorry Nov 24, 2025
6956cb5
put changes behind an autogate
anonrig Nov 24, 2025
166e9fd
Attempt to eliminate last regression (#5579)
erikcorry Nov 24, 2025
d462ca1
make changes due to simdutf
anonrig Dec 29, 2025
e4e393d
leverage simdutf more (#5797)
anonrig Dec 30, 2025
a03390f
fix build warning
anonrig Dec 30, 2025
1a2eab7
address pr reviews
anonrig Dec 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/workerd/api/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -434,13 +434,14 @@ wd_cc_library(
srcs = ["encoding.c++"],
hdrs = ["encoding.h"],
implementation_deps = [
"//src/workerd/io:features",
"//src/workerd/util:strings",
"@simdutf",
],
visibility = ["//visibility:public"],
deps = [
":util",
"//src/workerd/io:compatibility-date_capnp",
"//src/workerd/io:features",
"//src/workerd/jsg",
"@capnp-cpp//src/kj",
"@simdutf",
Expand Down Expand Up @@ -587,6 +588,14 @@ kj_test(
],
)

# Unit test target built from encoding-test.c++; it links against the
# :encoding library defined in this package and //src/workerd/io.
kj_test(
src = "encoding-test.c++",
deps = [
":encoding",
"//src/workerd/io",
],
)

kj_test(
src = "base64-test.c++",
deps = ["//src/workerd/tests:test-fixture"],
Expand Down
90 changes: 90 additions & 0 deletions src/workerd/api/encoding-test.c++
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) 2025 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0

#include "encoding.h"

#include <kj/test.h>

namespace workerd::api {
namespace test {

// These tests verify the findBestFit() function used by TextEncoder.encodeInto().
//
// bestFit(input, bufferSize) returns the number of input code units that can be
// fully converted to UTF-8 and fit within the given output buffer size in bytes.
//
// The key insight is that different characters expand to different UTF-8 byte lengths:
// - ASCII (U+0000-U+007F): 1 byte per code unit
// - Latin-1 extended (U+0080-U+00FF): 2 bytes per code unit
// - BMP characters (U+0100-U+FFFF): 2-3 bytes per code unit
// - Supplementary characters (U+10000+): 4 bytes, encoded as surrogate pairs in UTF-16
//
// The function must never split a surrogate pair, so if there's only room for part of
// a multi-byte character, it stops before that character.
// Exercises bestFit(input, bufferSize) with both narrow ("...") and UTF-16 (u"...")
// string literals. Despite the test's name, the inputs are not limited to ASCII: they
// include 'å' (2 UTF-8 bytes), '€' (3 UTF-8 bytes) and '😹' (4 UTF-8 bytes, encoded as a
// surrogate pair, i.e. two UTF-16 code units).
//
// Each assertion checks the number of input code units reported as fitting in an output
// buffer of `bufferSize` bytes.
// NOTE(review): bestFit is declared outside this file (presumably in encoding.h) — confirm
// which overloads the narrow and u"" literals resolve to.
KJ_TEST("BestFitASCII") {
// If there's zero input or output space, the answer is zero.
KJ_ASSERT(bestFit("", 0) == 0);
KJ_ASSERT(bestFit("a", 0) == 0);
KJ_ASSERT(bestFit("aa", 0) == 0);
KJ_ASSERT(bestFit("aaa", 0) == 0);
KJ_ASSERT(bestFit("aaaa", 0) == 0);
KJ_ASSERT(bestFit("aaaaa", 0) == 0);
// Empty input with growing buffers. The first line repeats the ("", 0) case above.
KJ_ASSERT(bestFit("", 0) == 0);
KJ_ASSERT(bestFit("", 1) == 0);
KJ_ASSERT(bestFit("", 2) == 0);
KJ_ASSERT(bestFit("", 3) == 0);
KJ_ASSERT(bestFit("", 4) == 0);
KJ_ASSERT(bestFit("", 5) == 0);
// Zero cases with two-byte strings.
KJ_ASSERT(bestFit(u"", 0) == 0);
KJ_ASSERT(bestFit(u"€", 0) == 0);
KJ_ASSERT(bestFit(u"€€", 0) == 0);
KJ_ASSERT(bestFit(u"€€€", 0) == 0);
KJ_ASSERT(bestFit(u"€€€€", 0) == 0);
KJ_ASSERT(bestFit(u"€€€€€", 0) == 0);
// Empty two-byte input with growing buffers. The first line repeats the (u"", 0) case above.
KJ_ASSERT(bestFit(u"", 0) == 0);
KJ_ASSERT(bestFit(u"", 1) == 0);
KJ_ASSERT(bestFit(u"", 2) == 0);
KJ_ASSERT(bestFit(u"", 3) == 0);
KJ_ASSERT(bestFit(u"", 4) == 0);
KJ_ASSERT(bestFit(u"", 5) == 0);
// Small buffers that only just fit.
// The emoji is a surrogate pair, so a successful fit consumes 2 code units.
KJ_ASSERT(bestFit(u"a", 1) == 1);
KJ_ASSERT(bestFit(u"å", 2) == 1);
KJ_ASSERT(bestFit(u"€", 3) == 1);
KJ_ASSERT(bestFit(u"😹", 4) == 2);
// Small buffers that don't fit.
// Each buffer is one byte short of the character's UTF-8 length.
KJ_ASSERT(bestFit(u"å", 1) == 0);
KJ_ASSERT(bestFit(u"€", 2) == 0);
KJ_ASSERT(bestFit(u"😹", 3) == 0);
// Don't chop a surrogate pair.
// In each case the buffer is one byte too small for the ASCII prefix plus the 4-byte
// emoji, so only the ASCII prefix is expected to be counted.
KJ_ASSERT(bestFit(u"1😹", 4) == 1);
KJ_ASSERT(bestFit(u"12😹", 5) == 2);
KJ_ASSERT(bestFit(u"123😹", 6) == 3);
KJ_ASSERT(bestFit(u"1234😹", 7) == 4);
KJ_ASSERT(bestFit(u"12345😹", 8) == 5);
// Some bigger ones just for fun.
// Six emoji (12 code units, 24 UTF-8 bytes): the expected result only advances in steps
// of 2 code units, once every 4 bytes of buffer.
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 0) == 0);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 1) == 0);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 2) == 0);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 3) == 0);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 4) == 2);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 5) == 2);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 6) == 2);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 7) == 2);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 8) == 4);
KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 9) == 4);
// Fixed 9-byte buffer with a growing ASCII prefix. In the trailing comments,
// "X is N and takes M" means the prefix X is N code units long and needs M UTF-8 bytes.
KJ_ASSERT(bestFit(u"0😹😹😹😹😹😹", 9) == 5); // 0😹😹 is 5 and takes 9.
KJ_ASSERT(bestFit(u"01😹😹😹😹😹😹", 9) == 4); // 01😹 is 4 and takes 6.
KJ_ASSERT(bestFit(u"012😹😹😹😹😹😹", 9) == 5); // 012😹 is 5 and takes 7.
KJ_ASSERT(bestFit(u"0123😹😹😹😹😹😹", 9) == 6); // 0123😹 is 6 and takes 8.
KJ_ASSERT(bestFit(u"01234😹😹😹😹😹😹", 9) == 7); // 01234😹 is 7 and takes 9.
KJ_ASSERT(bestFit(u"012345😹😹😹😹😹😹", 9) == 6); // 012345 is 6 and takes 6.
KJ_ASSERT(bestFit(u"0123456😹😹😹😹😹😹", 9) == 7); // 0123456 is 7 and takes 7.
KJ_ASSERT(bestFit(u"01234567😹😹😹😹😹😹", 9) == 8); // 01234567 is 8 and takes 8.
KJ_ASSERT(bestFit(u"012345678😹😹😹😹😹😹", 9) == 9); // 012345678 is 9 and takes 9.
}

} // namespace test
} // namespace workerd::api
Loading
Loading