diff --git a/internal/markers/constants.go b/internal/markers/constants.go index 0eee4b5..b51a1fc 100644 --- a/internal/markers/constants.go +++ b/internal/markers/constants.go @@ -14,8 +14,6 @@ package markers -import "regexp" - // Internal constants. const ( Start = '‹' @@ -38,5 +36,4 @@ var ( EscapeMarkBytes = []byte(EscapeMarkS) RedactedBytes = []byte(RedactedS) HashPrefixBytes = []byte(HashPrefixS) - ReStripMarkers = regexp.MustCompile("[" + StartS + EndS + HashPrefixS + "]") ) diff --git a/internal/markers/markers.go b/internal/markers/markers.go index a3e7350..53e1668 100644 --- a/internal/markers/markers.go +++ b/internal/markers/markers.go @@ -35,7 +35,14 @@ type RedactableString string // RedactableString. This returns an unsafe string where all safe and // unsafe bits are mixed together. func (s RedactableString) StripMarkers() string { - return ReStripMarkers.ReplaceAllString(string(s), "") + // Avoid the []byte conversion when no markers are present. + // All marker characters share the same leading UTF-8 byte (0xE2). + // This may false-positive on other Unicode characters (€, —, etc.), + // but stripMarkersBytes will correctly leave them untouched. + if strings.IndexByte(string(s), StartBytes[0]) == -1 { + return string(s) + } + return string(stripMarkersBytes([]byte(s), nil)) } // Redact replaces all occurrences of unsafe substrings by the @@ -72,7 +79,7 @@ type RedactableBytes []byte // RedactableBytes. This returns an unsafe string where all safe and // unsafe bits are mixed together. func (s RedactableBytes) StripMarkers() []byte { - return ReStripMarkers.ReplaceAll([]byte(s), nil) + return stripMarkersBytes([]byte(s), nil) } // Redact replaces all occurrences of unsafe substrings by the @@ -147,5 +154,57 @@ func RedactedMarker() []byte { return []byte(RedactedS) } // EscapeMarkers escapes the special delimiters from the provided // byte slice. func EscapeMarkers(s []byte) []byte { - return ReStripMarkers.ReplaceAll(s, EscapeMarkBytes) + return stripMarkersBytes(s, EscapeMarkBytes) +} + +// markerLen is the UTF-8 byte length of the marker characters. +// All marker characters (‹, ›, †) are 3-byte UTF-8 sequences sharing +// the same first two bytes. +const markerLen = 3 + +func init() { + // Verify that all marker characters share the same 2-byte UTF-8 prefix + // and are exactly 3 bytes long. + for _, m := range [][]byte{StartBytes, EndBytes, HashPrefixBytes} { + if len(m) != markerLen || m[0] != StartBytes[0] || m[1] != StartBytes[1] { + panic("marker characters must be 3-byte UTF-8 with shared prefix") + } + } +} + +// stripMarkersBytes scans data for marker characters (‹, ›, †) and either +// removes them (when replacement is nil) or replaces them with the +// replacement bytes. +func stripMarkersBytes(data []byte, replacement []byte) []byte { + lead := StartBytes[0] // first byte shared by all marker chars + // Fast path: no marker characters possible. + first := bytes.IndexByte(data, lead) + if first == -1 { + return data + } + + mid := StartBytes[1] // second byte shared by all marker chars + b2Start := StartBytes[2] + b2End := EndBytes[2] + b2Hash := HashPrefixBytes[2] + + buf := make([]byte, 0, len(data)) + pos := 0 + for i := first; i < len(data); { + if data[i] == lead && i+2 < len(data) && data[i+1] == mid { + if b := data[i+2]; b == b2Start || b == b2End || b == b2Hash { + buf = append(buf, data[pos:i]...) + buf = append(buf, replacement...) + i += markerLen + pos = i + continue + } + } + i++ + } + if pos == 0 { + return data + } + buf = append(buf, data[pos:]...) + return buf } diff --git a/internal/markers/strip_test.go b/internal/markers/strip_test.go new file mode 100644 index 0000000..ecff110 --- /dev/null +++ b/internal/markers/strip_test.go @@ -0,0 +1,359 @@ +// Copyright 2026 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package markers + +import ( + "strings" + "testing" +) + +func TestStripMarkers(t *testing.T) { + testCases := []struct { + name string + input string + expected string + }{ + // --- Empty and no-marker inputs --- + { + name: "empty string", + input: "", + expected: "", + }, + { + name: "single character", + input: "x", + expected: "x", + }, + { + name: "plain text no markers", + input: "hello world", + expected: "hello world", + }, + { + name: "long plain text", + input: strings.Repeat("abcdefghij", 100), + expected: strings.Repeat("abcdefghij", 100), + }, + + // --- Single marker characters --- + { + name: "lone start marker", + input: StartS, + expected: "", + }, + { + name: "lone end marker", + input: EndS, + expected: "", + }, + { + name: "lone hash prefix", + input: HashPrefixS, + expected: "", + }, + + // --- Markers around text --- + { + name: "start and end around text", + input: StartS + "secret" + EndS, + expected: "secret", + }, + { + name: "markers with leading safe text", + input: "safe " + StartS + "secret" + EndS, + expected: "safe secret", + }, + { + name: "markers with trailing safe text", + input: StartS + "secret" + EndS + " safe", + expected: "secret safe", + }, + { + name: "markers with surrounding safe text", + input: "before " + StartS + "secret" + EndS + " after", + expected: "before secret after", + }, + { + name: "empty marker contents", + input: StartS + EndS, + expected: "", + }, + + // --- Multiple markers --- + { + name: "two adjacent markers", + input: StartS + "a" + EndS + StartS + "b" + EndS, + expected: "ab", + }, + { + name: "two markers with safe text between", + input: StartS + "a" + EndS + " mid " + StartS + "b" + EndS, + expected: "a mid b", + }, + { + name: "many markers interleaved with safe text", + input: "a" + StartS + "1" + EndS + "b" + StartS + "2" + EndS + "c", + expected: "a1b2c", + }, + + // --- Hash prefix marker --- + { + name: "hash marker stripped", + input: StartS + HashPrefixS + "alice" + EndS, + expected: "alice", + }, + { + name: "hash prefix in middle of text", + input: "user=" + StartS + HashPrefixS + "alice" + EndS, + expected: "user=alice", + }, + { + name: "multiple hash prefixes", + input: StartS + HashPrefixS + "a" + EndS + " " + StartS + HashPrefixS + "b" + EndS, + expected: "a b", + }, + + // --- Redacted marker (‹×›) --- + { + name: "redacted marker stripped", + input: RedactedS, + expected: "×", + }, + { + name: "redacted marker with safe text", + input: "x " + RedactedS + " y", + expected: "x × y", + }, + + // --- Unicode content --- + { + name: "unicode content inside markers", + input: StartS + "日本語" + EndS, + expected: "日本語", + }, + { + name: "safe unicode text around markers", + input: "こんにちは " + StartS + "secret" + EndS + " 世界", + expected: "こんにちは secret 世界", + }, + + // --- Non-marker Unicode sharing the 0xE2 lead byte --- + // These characters share the same first UTF-8 byte as our markers + // but must pass through unmodified. + { + name: "euro sign preserved", + input: "price: €100", + expected: "price: €100", + }, + { + name: "em dash preserved", + input: "hello — world", + expected: "hello — world", + }, + { + name: "smart quotes preserved", + input: "\u2018hello\u2019 \u201Cworld\u201D", + expected: "\u2018hello\u2019 \u201Cworld\u201D", + }, + { + name: "bullet and ellipsis preserved", + input: "• item one … more", + expected: "• item one … more", + }, + { + name: "non-marker unicode with markers", + input: "€" + StartS + "secret" + EndS + "—", + expected: "€secret—", + }, + + // --- Multiple marker characters in sequence --- + { + name: "consecutive start markers", + input: StartS + StartS + StartS, + expected: "", + }, + { + name: "consecutive end markers", + input: EndS + EndS + EndS, + expected: "", + }, + { + name: "mixed consecutive markers", + input: StartS + EndS + HashPrefixS + StartS + EndS, + expected: "", + }, + + // --- Markers at boundaries --- + { + name: "start marker at beginning", + input: StartS + "rest of string", + expected: "rest of string", + }, + { + name: "end marker at end", + input: "rest of string" + EndS, + expected: "rest of string", + }, + { + name: "all three marker types", + input: "a" + StartS + "b" + HashPrefixS + "c" + EndS + "d", + expected: "abcd", + }, + + // --- Newlines and whitespace --- + { + name: "markers around whitespace", + input: StartS + " \t\n " + EndS, + expected: " \t\n ", + }, + { + name: "newlines between markers", + input: StartS + "a" + EndS + "\n" + StartS + "b" + EndS, + expected: "a\nb", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Test RedactableString.StripMarkers(). + got := RedactableString(tc.input).StripMarkers() + if got != tc.expected { + t.Errorf("RedactableString(%q).StripMarkers()\n got: %q\n want: %q", + tc.input, got, tc.expected) + } + + // Test RedactableBytes.StripMarkers(). + gotBytes := RedactableBytes(tc.input).StripMarkers() + if string(gotBytes) != tc.expected { + t.Errorf("RedactableBytes(%q).StripMarkers()\n got: %q\n want: %q", + tc.input, string(gotBytes), tc.expected) + } + }) + } +} + +func TestEscapeMarkers(t *testing.T) { + testCases := []struct { + name string + input string + expected string + }{ + { + name: "empty string", + input: "", + expected: "", + }, + { + name: "no markers", + input: "hello world", + expected: "hello world", + }, + { + name: "start marker escaped", + input: "a" + StartS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "end marker escaped", + input: "a" + EndS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "hash prefix escaped", + input: "a" + HashPrefixS + "b", + expected: "a" + EscapeMarkS + "b", + }, + { + name: "all markers escaped", + input: StartS + EndS + HashPrefixS, + expected: EscapeMarkS + EscapeMarkS + EscapeMarkS, + }, + { + name: "markers in context", + input: "a " + StartS + " b " + EndS + " c", + expected: "a " + EscapeMarkS + " b " + EscapeMarkS + " c", + }, + { + name: "unicode with markers", + input: "日本" + StartS + "語", + expected: "日本" + EscapeMarkS + "語", + }, + { + name: "non-marker unicode preserved", + input: "€100 — Pro\u2019s \u201Cquote\u201D • item…", + expected: "€100 — Pro\u2019s \u201Cquote\u201D • item…", + }, + { + name: "non-marker unicode mixed with markers", + input: "€" + StartS + "—" + EndS + "•", + expected: "€" + EscapeMarkS + "—" + EscapeMarkS + "•", + }, + { + name: "long string with markers", + input: strings.Repeat("abc", 50) + StartS + strings.Repeat("def", 50), + expected: strings.Repeat("abc", 50) + EscapeMarkS + strings.Repeat("def", 50), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := EscapeMarkers([]byte(tc.input)) + if string(got) != tc.expected { + t.Errorf("EscapeMarkers(%q)\n got: %q\n want: %q", + tc.input, string(got), tc.expected) + } + }) + } +} + +func BenchmarkStripMarkers_String(b *testing.B) { + s := RedactableString("user=" + StartS + "alice" + EndS + " action=" + StartS + "login" + EndS + " status=ok") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkStripMarkers_Bytes(b *testing.B) { + s := RedactableBytes("user=" + StartS + "alice" + EndS + " action=" + StartS + "login" + EndS + " status=ok") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkStripMarkers_NoMarkers(b *testing.B) { + s := RedactableString("hello world this is a plain string with no markers at all") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkStripMarkers_NonMarkerUnicode(b *testing.B) { + s := RedactableString("price: €100 — Pro\u2019s \u201Cquote\u201D • item…") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = s.StripMarkers() + } +} + +func BenchmarkEscapeMarkers(b *testing.B) { + s := []byte("a " + StartS + " b " + EndS + " c " + HashPrefixS + " d") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = EscapeMarkers(s) + } +}