From 0d2eacc71821cb5568b8237b69fb36c5c0f46128 Mon Sep 17 00:00:00 2001 From: David Hartunian Date: Thu, 12 Mar 2026 15:44:09 -0400 Subject: [PATCH 1/2] markers: add tests and benchmarks for StripMarkers and EscapeMarkers Add exhaustive table-driven tests for StripMarkers (both string and byte variants) and EscapeMarkers covering edge cases including empty inputs, unicode content, consecutive markers, hash prefix markers, and boundary conditions. Add benchmarks to establish a performance baseline before optimizing these functions. Co-Authored-By: roachdev-claude --- internal/markers/strip_test.go | 312 +++++++++++++++++++++++++++++++++ 1 file changed, 312 insertions(+) create mode 100644 internal/markers/strip_test.go diff --git a/internal/markers/strip_test.go b/internal/markers/strip_test.go new file mode 100644 index 0000000..c425910 --- /dev/null +++ b/internal/markers/strip_test.go @@ -0,0 +1,312 @@ +// Copyright 2026 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package markers + +import ( + "strings" + "testing" +) + +func TestStripMarkers(t *testing.T) { + testCases := []struct { + name string + input string + expected string + }{ + // --- Empty and no-marker inputs --- + { + name: "empty string", + input: "", + expected: "", + }, + { + name: "single character", + input: "x", + expected: "x", + }, + { + name: "plain text no markers", + input: "hello world", + expected: "hello world", + }, + { + name: "long plain text", + input: strings.Repeat("abcdefghij", 100), + expected: strings.Repeat("abcdefghij", 100), + }, + + // --- Single marker characters --- + { + name: "lone start marker", + input: StartS, + expected: "", + }, + { + name: "lone end marker", + input: EndS, + expected: "", + }, + { + name: "lone hash prefix", + input: HashPrefixS, + expected: "", + }, + + // --- Markers around text --- + { + name: "start and end around text", + input: StartS + "secret" + EndS, + expected: "secret", + }, + { + name: "markers with leading safe text", + input: "safe " + StartS + "secret" + EndS, + expected: "safe secret", + }, + { + name: "markers with trailing safe text", + input: StartS + "secret" + EndS + " safe", + expected: "secret safe", + }, + { + name: "markers with surrounding safe text", + input: "before " + StartS + "secret" + EndS + " after", + expected: "before secret after", + }, + { + name: "empty marker contents", + input: StartS + EndS, + expected: "", + }, + + // --- Multiple markers --- + { + name: "two adjacent markers", + input: StartS + "a" + EndS + StartS + "b" + EndS, + expected: "ab", + }, + { + name: "two markers with safe text between", + input: StartS + "a" + EndS + " mid " + StartS + "b" + EndS, + expected: "a mid b", + }, + { + name: "many markers interleaved with safe text", + input: "a" + StartS + "1" + EndS + "b" + StartS + "2" + EndS + "c", + expected: "a1b2c", + }, + + // --- Hash prefix marker --- + { + name: "hash marker stripped", + input: StartS + HashPrefixS + "alice" + EndS, + expected: "alice", + }, + { + name: "hash prefix in middle of text", + input: "user=" + StartS + HashPrefixS + "alice" + EndS, + expected: "user=alice", + }, + { + name: "multiple hash prefixes", + input: StartS + HashPrefixS + "a" + EndS + " " + StartS + HashPrefixS + "b" + EndS, + expected: "a b", + }, + + // --- Redacted marker (‹×›) --- + { + name: "redacted marker stripped", + input: RedactedS, + expected: "×", + }, + { + name: "redacted marker with safe text", + input: "x " + RedactedS + " y", + expected: "x × y", + }, + + // --- Unicode content --- + { + name: "unicode content inside markers", + input: StartS + "日本語" + EndS, + expected: "日本語", + }, + { + name: "safe unicode text around markers", + input: "こんにちは " + StartS + "secret" + EndS + " 世界", + expected: "こんにちは secret 世界", + }, + + // --- Multiple marker characters in sequence --- + { + name: "consecutive start markers", + input: StartS + StartS + StartS, + expected: "", + }, + { + name: "consecutive end markers", + input: EndS + EndS + EndS, + expected: "", + }, + { + name: "mixed consecutive markers", + input: StartS + EndS + HashPrefixS + StartS + EndS, + expected: "", + }, + + // --- Markers at boundaries --- + { + name: "start marker at beginning", + input: StartS + "rest of string", + expected: "rest of string", + }, + { + name: "end marker at end", + input: "rest of string" + EndS, + expected: "rest of string", + }, + { + name: "all three marker types", + input: "a" + StartS + "b" + HashPrefixS + "c" + EndS + "d", + expected: "abcd", + }, + + // --- Newlines and whitespace --- + { + name: "markers around whitespace", + input: StartS + " \t\n " + EndS, + expected: " \t\n ", + }, + { + name: "newlines between markers", + input: StartS + "a" + EndS + "\n" + StartS + "b" + EndS, + expected: "a\nb", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Test RedactableString.StripMarkers(). + got := RedactableString(tc.input).StripMarkers() + if got != tc.expected { + t.Errorf("RedactableString(%q).StripMarkers()\n got: %q\n want: %q", + tc.input, got, tc.expected) + } + + // Test RedactableBytes.StripMarkers(). + gotBytes := RedactableBytes(tc.input).StripMarkers() + if string(gotBytes) != tc.expected { + t.Errorf("RedactableBytes(%q).StripMarkers()\n got: %q\n want: %q", + tc.input, string(gotBytes), tc.expected) + } + }) + } +} + +func TestEscapeMarkers(t *testing.T) { + testCases := []struct { + name string + input string + expected string + }{ + { + name: "empty string", + input: "", + expected: "", + }, + { + name: "no markers", + input: "hello world", + expected: "hello world", + }, + { + name: "start marker escaped", + input: "a" + StartS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "end marker escaped", + input: "a" + EndS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "hash prefix escaped", + input: "a" + HashPrefixS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "all markers escaped", + input: StartS + EndS + HashPrefixS, + expected: EscapeMarkS + EscapeMarkS + EscapeMarkS, + }, + { + name: "markers in context", + input: "a " + StartS + " b " + EndS + " c", + expected: "a " + EscapeMarkS + " b " + EscapeMarkS + " c", + }, + { + name: "unicode with markers", + input: "日本" + StartS + "語", + expected: "日本" + EscapeMarkS + "語", + }, + { + name: "long string with markers", + input: strings.Repeat("abc", 50) + StartS + strings.Repeat("def", 50), + expected: strings.Repeat("abc", 50) + EscapeMarkS + strings.Repeat("def", 50), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := EscapeMarkers([]byte(tc.input)) + if string(got) != tc.expected { + t.Errorf("EscapeMarkers(%q)\n got: %q\n want: %q", + tc.input, string(got), tc.expected) + } + }) + } +} + +func BenchmarkStripMarkers_String(b *testing.B) { + s := RedactableString("user=" + StartS + "alice" + EndS + " action=" + StartS + "login" + EndS + " status=ok") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkStripMarkers_Bytes(b *testing.B) { + s := RedactableBytes("user=" + StartS + "alice" + EndS + " action=" + StartS + "login" + EndS + " status=ok") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkStripMarkers_NoMarkers(b *testing.B) { + s := RedactableString("hello world this is a plain string with no markers at all") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkEscapeMarkers(b *testing.B) { + s := []byte("a " + StartS + " b " + EndS + " c " + HashPrefixS + " d") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = EscapeMarkers(s) + } +} From ccab926cf1d12976cfbc5d8daa13d74d8d7c3382 Mon Sep 17 00:00:00 2001 From: David Hartunian Date: Thu, 12 Mar 2026 15:53:53 -0400 Subject: [PATCH 2/2] markers: replace regexp with scanning in StripMarkers and EscapeMarkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the regexp-based implementation of StripMarkers() and EscapeMarkers() with manual byte scanning using a shared stripMarkersBytes() function. The new implementation exploits the fact that all three marker characters (‹ U+2039, › U+203A, † U+2020) are 3-byte UTF-8 sequences sharing the prefix 0xE2 0x80. It scans for 0xE2 bytes and checks the subsequent two bytes to identify markers, then either removes them or replaces them with the escape character. Also removes the now-unused ReStripMarkers regexp and its regexp import. Benchmark results (Apple M1 Pro): name old time/op new time/op delta StripMarkers_String-10 738ns ± 2% 105ns ± 1% -85.83% (p=0.000 n=8) StripMarkers_Bytes-10 733ns ± 4% 58ns ± 1% -92.10% (p=0.000 n=8) StripMarkers_NoMarkers-10 660ns ± 1% 23ns ± 0% -96.53% (p=0.000 n=8) EscapeMarkers-10 377ns ± 2% 45ns ±18% -87.99% (p=0.000 n=8) name old alloc/op new alloc/op delta StripMarkers_String-10 185B ± 1% 144B ± 0% -22.16% (p=0.000 n=8) StripMarkers_Bytes-10 136B ± 0% 48B ± 0% -64.71% (p=0.000 n=8) StripMarkers_NoMarkers-10 144B ± 0% 0B ± 0% -100.00% (p=0.000 n=8) EscapeMarkers-10 40.0B ± 0% 24.0B ± 0% -40.00% (p=0.000 n=8) name old allocs/op new allocs/op delta StripMarkers_String-10 6.00 ± 0% 3.00 ± 0% -50.00% (p=0.000 n=8) StripMarkers_Bytes-10 5.00 ± 0% 1.00 ± 0% -80.00% (p=0.000 n=8) StripMarkers_NoMarkers-10 3.00 ± 0% 0.00 ± 0% -100.00% (p=0.000 n=8) EscapeMarkers-10 3.00 ± 0% 1.00 ± 0% -66.67% (p=0.000 n=8) Co-Authored-By: roachdev-claude --- internal/markers/constants.go | 3 -- internal/markers/markers.go | 65 ++++++++++++++++++++++++++++++++-- internal/markers/strip_test.go | 47 ++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 6 deletions(-) diff --git a/internal/markers/constants.go b/internal/markers/constants.go index 0eee4b5..b51a1fc 100644 --- a/internal/markers/constants.go +++ b/internal/markers/constants.go @@ -14,8 +14,6 @@ package markers -import "regexp" - // Internal constants. const ( Start = '‹' @@ -38,5 +36,4 @@ var ( EscapeMarkBytes = []byte(EscapeMarkS) RedactedBytes = []byte(RedactedS) HashPrefixBytes = []byte(HashPrefixS) - ReStripMarkers = regexp.MustCompile("[" + StartS + EndS + HashPrefixS + "]") ) diff --git a/internal/markers/markers.go b/internal/markers/markers.go index a3e7350..53e1668 100644 --- a/internal/markers/markers.go +++ b/internal/markers/markers.go @@ -35,7 +35,14 @@ type RedactableString string // RedactableString. This returns an unsafe string where all safe and // unsafe bits are mixed together. func (s RedactableString) StripMarkers() string { - return ReStripMarkers.ReplaceAllString(string(s), "") + // Avoid the []byte conversion when no markers are present. + // All marker characters share the same leading UTF-8 byte (0xE2). + // This may false-positive on other Unicode characters (€, —, etc.), + // but stripMarkersBytes will correctly leave them untouched. + if strings.IndexByte(string(s), StartBytes[0]) == -1 { + return string(s) + } + return string(stripMarkersBytes([]byte(s), nil)) } // Redact replaces all occurrences of unsafe substrings by the @@ -72,7 +79,7 @@ type RedactableBytes []byte // RedactableBytes. This returns an unsafe string where all safe and // unsafe bits are mixed together. func (s RedactableBytes) StripMarkers() []byte { - return ReStripMarkers.ReplaceAll([]byte(s), nil) + return stripMarkersBytes([]byte(s), nil) } // Redact replaces all occurrences of unsafe substrings by the @@ -147,5 +154,57 @@ func RedactedMarker() []byte { return []byte(RedactedS) } // EscapeMarkers escapes the special delimiters from the provided // byte slice. func EscapeMarkers(s []byte) []byte { - return ReStripMarkers.ReplaceAll(s, EscapeMarkBytes) + return stripMarkersBytes(s, EscapeMarkBytes) +} + +// markerLen is the UTF-8 byte length of the marker characters. +// All marker characters (‹, ›, †) are 3-byte UTF-8 sequences sharing +// the same first two bytes. +const markerLen = 3 + +func init() { + // Verify that all marker characters share the same 2-byte UTF-8 prefix + // and are exactly 3 bytes long. + for _, m := range [][]byte{StartBytes, EndBytes, HashPrefixBytes} { + if len(m) != markerLen || m[0] != StartBytes[0] || m[1] != StartBytes[1] { + panic("marker characters must be 3-byte UTF-8 with shared prefix") + } + } +} + +// stripMarkersBytes scans data for marker characters (‹, ›, †) and either +// removes them (when replacement is nil) or replaces them with the +// replacement bytes. +func stripMarkersBytes(data []byte, replacement []byte) []byte { + lead := StartBytes[0] // first byte shared by all marker chars + // Fast path: no marker characters possible. + first := bytes.IndexByte(data, lead) + if first == -1 { + return data + } + + mid := StartBytes[1] // second byte shared by all marker chars + b2Start := StartBytes[2] + b2End := EndBytes[2] + b2Hash := HashPrefixBytes[2] + + buf := make([]byte, 0, len(data)) + pos := 0 + for i := first; i < len(data); { + if data[i] == lead && i+2 < len(data) && data[i+1] == mid { + if b := data[i+2]; b == b2Start || b == b2End || b == b2Hash { + buf = append(buf, data[pos:i]...) + buf = append(buf, replacement...) + i += markerLen + pos = i + continue + } + } + i++ + } + if pos == 0 { + return data + } + buf = append(buf, data[pos:]...) + return buf } diff --git a/internal/markers/strip_test.go b/internal/markers/strip_test.go index c425910..ecff110 100644 --- a/internal/markers/strip_test.go +++ b/internal/markers/strip_test.go @@ -149,6 +149,35 @@ func TestStripMarkers(t *testing.T) { expected: "こんにちは secret 世界", }, + // --- Non-marker Unicode sharing the 0xE2 lead byte --- + // These characters share the same first UTF-8 byte as our markers + // but must pass through unmodified. + { + name: "euro sign preserved", + input: "price: €100", + expected: "price: €100", + }, + { + name: "em dash preserved", + input: "hello — world", + expected: "hello — world", + }, + { + name: "smart quotes preserved", + input: "\u2018hello\u2019 \u201Cworld\u201D", + expected: "\u2018hello\u2019 \u201Cworld\u201D", + }, + { + name: "bullet and ellipsis preserved", + input: "• item one … more", + expected: "• item one … more", + }, + { + name: "non-marker unicode with markers", + input: "€" + StartS + "secret" + EndS + "—", + expected: "€secret—", + }, + // --- Multiple marker characters in sequence --- { name: "consecutive start markers", @@ -261,6 +290,16 @@ func TestEscapeMarkers(t *testing.T) { input: "日本" + StartS + "語", expected: "日本" + EscapeMarkS + "語", }, + { + name: "non-marker unicode preserved", + input: "€100 — Pro\u2019s \u201Cquote\u201D • item…", + expected: "€100 — Pro\u2019s \u201Cquote\u201D • item…", + }, + { + name: "non-marker unicode mixed with markers", + input: "€" + StartS + "—" + EndS + "•", + expected: "€" + EscapeMarkS + "—" + EscapeMarkS + "•", + }, { name: "long string with markers", input: strings.Repeat("abc", 50) + StartS + strings.Repeat("def", 50), @@ -303,6 +342,14 @@ func BenchmarkStripMarkers_NoMarkers(b *testing.B) { } } +func BenchmarkStripMarkers_NonMarkerUnicode(b *testing.B) { + s := RedactableString("price: €100 — Pro\u2019s \u201Cquote\u201D • item…") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + func BenchmarkEscapeMarkers(b *testing.B) { s := []byte("a " + StartS + " b " + EndS + " c " + HashPrefixS + " d") b.ResetTimer()