From 813d7a3231814dd3c6ef4f31167b37cd34d8889a Mon Sep 17 00:00:00 2001 From: fangshuyu-768 Date: Mon, 20 Apr 2026 23:36:39 +0800 Subject: [PATCH 1/2] fix(doc): exclude code regions and escaped markers from docs +update checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the three review comments on #569: the blank-line paragraph check and the bold+italic emphasis check both operate on the raw markdown string, so fenced code blocks / inline code spans / literal escaped markers produce false-positive warnings on content users expect to pass through verbatim. Changes: - Add proseHasBlankLine(): fence-aware detector that returns true only when a blank line sits outside of ```...``` or ~~~...~~~ regions. Replaces the raw strings.Contains("\n\n") check in checkDocsUpdateReplaceMultilineMarkdown. - Add stripMarkdownCodeRegions(): blanks out fenced code lines and masks inline code spans (via scanInlineCodeSpans from markdown_fix.go) with equal-length whitespace so byte offsets outside the stripped regions are preserved. - Add stripEscapedEmphasisMarkers(): removes "\*" and "\_" so literal sequences like "\***text***" — which CommonMark renders as a literal asterisk plus bold — don't match the combined bold+italic regex. - Wire both helpers into checkDocsUpdateBoldItalic(): the regex now runs on stripEscapedEmphasisMarkers(stripMarkdownCodeRegions(markdown)), so code samples and escaped markers are sanitized away before detection. Shared fence-parsing helpers (codeFenceOpenMarker, isCodeFenceClose, leadingRun) are kept local to this file to avoid touching files outside the scope of the reviewed PR. If a future change wants to reuse them across the doc package, they can be promoted then. 
Tests: - TestCheckDocsUpdateReplaceMultilineMarkdown: add 4 negative/positive cases — blank line inside backtick and tilde fences (no flag), blank line in prose while fence also has blanks (flag wins), fenced code with no blank lines (no flag). - TestCheckDocsUpdateBoldItalic: add 9 cases — ***text*** / **_text_** / _**text**_ inside fenced code (backtick and tilde), inside inline code spans, and escaped \***text*** / \*\*_text_\*\* (none flagged); plus two positive cases to verify the strip doesn't over-sanitize (real emphasis in prose still fires when inline/fenced code is nearby). --- shortcuts/doc/docs_update_check.go | 159 ++++++++++++++++++++++-- shortcuts/doc/docs_update_check_test.go | 81 ++++++++++++ 2 files changed, 228 insertions(+), 12 deletions(-) diff --git a/shortcuts/doc/docs_update_check.go b/shortcuts/doc/docs_update_check.go index 273f3e07d..dba9eda0e 100644 --- a/shortcuts/doc/docs_update_check.go +++ b/shortcuts/doc/docs_update_check.go @@ -14,13 +14,18 @@ import ( // commonly surprise users; the update is still executed — callers // decide whether to stop at a warning. // +// Both checks ignore fenced code blocks, inline code spans, and +// backslash-escaped emphasis markers so that literal Markdown content +// embedded in code samples or escaped prose does not produce false +// positives. +// // Warnings emitted (current): // // 1. replace_* modes do not split blocks. A Markdown payload containing -// a blank line (\n\n) implies the caller expects multiple paragraphs, -// but replace_range / replace_all only swap in-block text. The -// resulting block will contain the blank line as literal text and -// appear as a single paragraph in the UI. +// a blank line (\n\n) in prose implies the caller expects multiple +// paragraphs, but replace_range / replace_all only swap in-block +// text. The resulting block will contain the blank line as literal +// text and appear as a single paragraph in the UI. // // 2. Lark does not round-trip bold+italic. 
Markdown like ***text*** or // **_text_** / _**text**_ is stored as only one of the two emphases @@ -38,16 +43,17 @@ func docsUpdateWarnings(mode, markdown string) []string { } // checkDocsUpdateReplaceMultilineMarkdown flags markdown that contains a -// blank-line paragraph break under a replace_* mode. Returns an empty -// string when the combination is fine. +// blank-line paragraph break outside fenced code blocks under a replace_* +// mode. Blank lines inside code fences are literal content and don't +// imply paragraph semantics, so they are deliberately ignored. func checkDocsUpdateReplaceMultilineMarkdown(mode, markdown string) string { if mode != "replace_range" && mode != "replace_all" { return "" } // A CR/LF-robust check: both "\n\n" and "\r\n\r\n" count as paragraph - // separators. We normalize line endings once before the substring match. + // separators. We normalize line endings once before detection. normalized := strings.ReplaceAll(markdown, "\r\n", "\n") - if !strings.Contains(normalized, "\n\n") { + if !proseHasBlankLine(normalized) { return "" } return "--mode=" + mode + " does not split a block into multiple paragraphs; " + @@ -67,17 +73,146 @@ var reBoldItalicUnderscoreInside = regexp.MustCompile(`\*\*_\S[^_*]*?\S_\*\*|\*\ var reBoldItalicUnderscoreOutside = regexp.MustCompile(`_\*\*\S[^_*]*?\S\*\*_|_\*\*\S\*\*_`) // checkDocsUpdateBoldItalic flags Markdown emphases that attempt to -// combine bold and italic in a way Lark cannot represent. +// combine bold and italic in a way Lark cannot represent. Fenced code +// blocks, inline code spans, and backslash-escaped emphasis markers are +// stripped first so that literal markdown examples ("here is a +// `***keyword***` to flag") do not trigger the warning. 
func checkDocsUpdateBoldItalic(markdown string) string { if markdown == "" { return "" } - if reBoldItalicTriple.MatchString(markdown) || - reBoldItalicUnderscoreInside.MatchString(markdown) || - reBoldItalicUnderscoreOutside.MatchString(markdown) { + sanitized := stripEscapedEmphasisMarkers(stripMarkdownCodeRegions(markdown)) + if reBoldItalicTriple.MatchString(sanitized) || + reBoldItalicUnderscoreInside.MatchString(sanitized) || + reBoldItalicUnderscoreOutside.MatchString(sanitized) { return "Lark does not support combined bold+italic markers (***text***, **_text_**, _**text**_); " + "the emphasis will be downgraded to either bold or italic. " + "Split into two separate emphases or drop one of them." } return "" } + +// proseHasBlankLine reports whether markdown contains a blank line outside +// of fenced code blocks. Blank lines inside ```...``` or ~~~...~~~ fences +// are code content, not paragraph separators, and must not trip the +// "replace_* cannot split paragraphs" warning. +// +// A blank (whitespace-only) line outside a fence counts whenever it is +// neither the first nor the last element of the "\n" split; neighbouring +// lines are not inspected. One trailing newline at EOF is thus ignored. +func proseHasBlankLine(markdown string) bool { + lines := strings.Split(markdown, "\n") + inFence := false + var fenceMarker string + for i, line := range lines { + trimmed := strings.TrimSpace(line) + if inFence { + if isCodeFenceClose(trimmed, fenceMarker) { + inFence = false + fenceMarker = "" + } + continue + } + if marker := codeFenceOpenMarker(trimmed); marker != "" { + inFence = true + fenceMarker = marker + continue + } + if trimmed == "" && i > 0 && i+1 < len(lines) { + return true + } + } + return false +} + +// stripMarkdownCodeRegions returns markdown with fenced code lines emptied +// and inline code spans replaced by whitespace of equivalent length. +// Line count and in-line column offsets are preserved; absolute byte +// offsets after a fenced block are not, because fenced lines become "". 
+func stripMarkdownCodeRegions(markdown string) string { + lines := strings.Split(markdown, "\n") + inFence := false + var fenceMarker string + for i, line := range lines { + trimmed := strings.TrimSpace(line) + if inFence { + if isCodeFenceClose(trimmed, fenceMarker) { + inFence = false + fenceMarker = "" + } + lines[i] = "" + continue + } + if marker := codeFenceOpenMarker(trimmed); marker != "" { + inFence = true + fenceMarker = marker + lines[i] = "" + continue + } + lines[i] = maskInlineCodeSpans(line) + } + return strings.Join(lines, "\n") +} + +// maskInlineCodeSpans replaces the byte ranges of any inline code spans in +// line with space characters of equal length. Uses scanInlineCodeSpans from +// markdown_fix.go, which implements the CommonMark §6.1 matching-backtick-run +// rule (so “ `a`b` “ is a single span). +func maskInlineCodeSpans(line string) string { + spans := scanInlineCodeSpans(line) + if len(spans) == 0 { + return line + } + var sb strings.Builder + pos := 0 + for _, loc := range spans { + sb.WriteString(line[pos:loc[0]]) + sb.WriteString(strings.Repeat(" ", loc[1]-loc[0])) + pos = loc[1] + } + sb.WriteString(line[pos:]) + return sb.String() +} + +// stripEscapedEmphasisMarkers removes backslash-escaped '*' and '_' so the +// bold/italic regexes don't treat literal sequences like `\***text***` as +// real combined emphasis. CommonMark renders "\*" as a literal "*" with no +// emphasis semantics; dropping the escape + its target from the detection +// input keeps the heuristic aligned with what the renderer actually does. +func stripEscapedEmphasisMarkers(s string) string { + s = strings.ReplaceAll(s, `\*`, "") + s = strings.ReplaceAll(s, `\_`, "") + return s +} + +// codeFenceOpenMarker returns the exact fence marker (e.g. "```" or "~~~~") +// if trimmed opens a fenced code block, otherwise "". Supports any fence of +// length ≥ 3 per CommonMark §4.5. 
+func codeFenceOpenMarker(trimmed string) string { + switch { + case strings.HasPrefix(trimmed, "```"): + return leadingRun(trimmed, '`') + case strings.HasPrefix(trimmed, "~~~"): + return leadingRun(trimmed, '~') + } + return "" +} + +// isCodeFenceClose reports whether trimmed closes a fence opened with +// marker. Per CommonMark, the closer must use the same fence character, +// be at least as long as the opener, and contain no info-string text. +func isCodeFenceClose(trimmed, marker string) bool { + if marker == "" || !strings.HasPrefix(trimmed, marker) { + return false + } + return strings.TrimSpace(trimmed[len(marker):]) == "" +} + +// leadingRun returns the longest prefix of s made up of the byte c. +func leadingRun(s string, c byte) string { + i := 0 + for i < len(s) && s[i] == c { + i++ + } + return s[:i] +} diff --git a/shortcuts/doc/docs_update_check_test.go b/shortcuts/doc/docs_update_check_test.go index 39dbe5336..c71e7c313 100644 --- a/shortcuts/doc/docs_update_check_test.go +++ b/shortcuts/doc/docs_update_check_test.go @@ -65,6 +65,37 @@ func TestCheckDocsUpdateReplaceMultilineMarkdown(t *testing.T) { markdown: "", wantHint: false, }, + { + // The check must ignore blank lines inside fenced code; otherwise + // a user replacing one block with a legitimate code sample that + // contains blank lines would see a spurious warning. + name: "blank line inside backtick fenced code is not flagged", + mode: "replace_range", + markdown: "```\nline1\n\nline2\n```", + wantHint: false, + }, + { + name: "blank line inside tilde fenced code is not flagged", + mode: "replace_range", + markdown: "~~~\ncode line one\n\ncode line two\n~~~", + wantHint: false, + }, + { + // Mixed prose + fenced code: any blank line in prose still wins, + // even if the fenced content also contains blanks. 
+ name: "blank line in prose outside fence still flags even when fence has blanks", + mode: "replace_range", + markdown: "first paragraph\n\nsecond paragraph\n\n```\ncode\n\nmore\n```", + wantHint: true, + }, + { + // Fenced code with no blank lines inside must not trip on the + // fence markers themselves. + name: "fenced code with no blank lines does not flag", + mode: "replace_range", + markdown: "prose before\n```go\nfmt.Println(\"hi\")\n```\nprose after", + wantHint: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -135,6 +166,56 @@ func TestCheckDocsUpdateBoldItalic(t *testing.T) { input: "", wantHint: false, }, + { + // The emphasis check must not fire on literal Markdown samples + // inside a fenced code block — the canonical use case is docs + // authors pasting tutorials that demonstrate these exact patterns. + name: "triple asterisks inside backtick fenced code is not flagged", + input: "example:\n```\nthe shape ***keyword*** downgrades\n```", + wantHint: false, + }, + { + name: "underscore-bold inside fenced code is not flagged", + input: "example:\n```markdown\nuse **_strong italic_** carefully\n```", + wantHint: false, + }, + { + name: "bold-underscore inside fenced code is not flagged", + input: "example:\n~~~\n_**outside-underscore**_ is a bad shape\n~~~", + wantHint: false, + }, + { + name: "triple asterisks inside inline code span is not flagged", + input: "the literal `***text***` marker is just a sample", + wantHint: false, + }, + { + name: "underscore-bold inside inline code is not flagged", + input: "the shape `**_italic_**` would downgrade, but only if it were real", + wantHint: false, + }, + { + name: "escaped triple asterisks rendered as literal text is not flagged", + input: `the literal \***text*** with escaped opener`, + wantHint: false, + }, + { + name: "escaped bold inside underscore-italic is not flagged", + input: `shape \*\*_text_\*\* is literal, not emphasis`, + wantHint: false, + }, + { + // Real 
emphasis outside the code span must still be detected — + // the strip step must not over-sanitize. + name: "real triple asterisks outside inline code still flags", + input: "real ***strong*** and literal `***keyword***` — the first one counts", + wantHint: true, + }, + { + name: "real triple asterisks outside fenced code still flags", + input: "real ***strong***\n\n```\nliteral ***keyword*** in code\n```", + wantHint: true, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { From edccf5c1c79b44e07bfb4532f2cbd77675bb707c Mon Sep 17 00:00:00 2001 From: fangshuyu-768 Date: Tue, 21 Apr 2026 11:16:32 +0800 Subject: [PATCH 2/2] fix(doc): close CommonMark gaps and add three more combined-emphasis shapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of the first commit turned up three issues: - isCodeFenceClose was strict on exact marker length. Per CommonMark §4.5, a closing fence must be at least as long as the opener, not exactly the same length. A 3-backtick open legitimately closed by a 4-backtick closer (used to embed triple-backticks inside the code sample) was left open-ended, causing the rest of the document to be treated as code and both checks to silently skip it. - Both fence helpers accepted any amount of leading whitespace because they ran on strings.TrimSpace(line). CommonMark allows 0..3 leading spaces before a fence marker; 4+ spaces (or any tab in leading position, which expands to 4 columns) makes the line indented code block content, not a fence open/close. Indented fence-like lines now correctly remain prose and blank lines around them are detected. - The bold/italic check only covered three of the six documented combined-emphasis shapes. Added ___text___, __*text*__, and *__text__* so parity with the asterisk variants is complete. The regex set is now table-driven (combinedEmphasisPatterns) to make adding future shapes a one-line change. 
Implementation changes: - New fenceIndentOK(line) helper: returns (body, true) for 0..3 leading spaces with no tabs, else (_, false). Used by both codeFenceOpenMarker and isCodeFenceClose. - isCodeFenceClose now counts the fence-char run and accepts any run length >= len(marker), with trailing whitespace only. - checkDocsUpdateBoldItalic replaced three named var regexes with a table of six {shape, re} entries and a single early-exit loop. - Updated docsUpdateWarnings top docstring to list all six shapes. - Noted the known limitation of stripEscapedEmphasisMarkers around doubled backslash escapes ("\\***text***"), which is a false negative we accept in exchange for keeping this a simple string replace. Test additions (docs_update_check_test.go): - Fence close: longer-marker close correctly ends fence; real prose blank after a longer-close fence is still detected. - Indentation: 4-space indented fence-like line is not a fence open, so a surrounding blank line still flags; tab-indented variant same; 3-space indented fence is still a real fence. - New shapes: ___text___ positive + all three negative-guards (fenced code, inline code, escaped); __*text*__ and *__text__* positive + fenced/inline negative-guards; plus two composition tests to ensure the strip does not over-sanitize across the six-regex alternative set. All 53 sub-tests in this file pass; go vet and gofmt are clean. --- shortcuts/doc/docs_update_check.go | 143 +++++++++++++++++------- shortcuts/doc/docs_update_check_test.go | 125 +++++++++++++++++++++ 2 files changed, 228 insertions(+), 40 deletions(-) diff --git a/shortcuts/doc/docs_update_check.go b/shortcuts/doc/docs_update_check.go index dba9eda0e..cf71c1012 100644 --- a/shortcuts/doc/docs_update_check.go +++ b/shortcuts/doc/docs_update_check.go @@ -14,7 +14,8 @@ import ( // commonly surprise users; the update is still executed — callers // decide whether to stop at a warning. 
// -// Both checks ignore fenced code blocks, inline code spans, and +// Both checks ignore fenced code blocks (```…``` and ~~~…~~~, with up +// to 3 leading spaces per CommonMark §4.5), inline code spans, and // backslash-escaped emphasis markers so that literal Markdown content // embedded in code samples or escaped prose does not produce false // positives. @@ -27,10 +28,12 @@ import ( // text. The resulting block will contain the blank line as literal // text and appear as a single paragraph in the UI. // -// 2. Lark does not round-trip bold+italic. Markdown like ***text*** or -// **_text_** / _**text**_ is stored as only one of the two emphases -// (usually italic), silently dropping the other. The user wanted -// both; they will get one. +// 2. Lark does not round-trip bold+italic. Six shapes are detected: +// ***text*** ___text___ +// **_text_** __*text*__ +// _**text**_ *__text__* +// Lark stores only one of the two emphases (usually italic), silently +// dropping the other. The user wanted both; they will get one. func docsUpdateWarnings(mode, markdown string) []string { var warnings []string if w := checkDocsUpdateReplaceMultilineMarkdown(mode, markdown); w != "" { @@ -61,16 +64,28 @@ func checkDocsUpdateReplaceMultilineMarkdown(mode, markdown string) string { "For multiple paragraphs, use --mode=delete_range followed by --mode=insert_before." } -// reBoldItalicTriple matches ***text*** with non-whitespace text between. -var reBoldItalicTriple = regexp.MustCompile(`\*\*\*\S[^*]*?\S\*\*\*|\*\*\*\S\*\*\*`) +// combinedEmphasisPatterns holds the six documented combined-emphasis shapes +// that Lark downgrades to a single emphasis. Each entry pairs a regex with a +// short shape label for the warning message. The two forms per shape (with +// and without `[^…]*?`) are there because the first form anchors on two +// separate `\S` characters and so needs a payload of at least two characters; +// single-character payloads (e.g. `***X***`) take the second alternation. 
+var combinedEmphasisPatterns = []struct { + shape string + re *regexp.Regexp +}{ + // Bold+italic with a single delimiter char. + {"***text***", regexp.MustCompile(`\*\*\*\S[^*]*?\S\*\*\*|\*\*\*\S\*\*\*`)}, + {"___text___", regexp.MustCompile(`___\S[^_]*?\S___|___\S___`)}, -// reBoldItalicUnderscoreInside matches **_text_** — bold wrapping an -// underscore italic. Same downgrade issue in Lark. -var reBoldItalicUnderscoreInside = regexp.MustCompile(`\*\*_\S[^_*]*?\S_\*\*|\*\*_\S_\*\*`) + // Bold wrapping italic (the two emphases use different delimiter chars). + {"**_text_**", regexp.MustCompile(`\*\*_\S[^_*]*?\S_\*\*|\*\*_\S_\*\*`)}, + {"__*text*__", regexp.MustCompile(`__\*\S[^_*]*?\S\*__|__\*\S\*__`)}, -// reBoldItalicUnderscoreOutside matches _**text**_ — underscore italic -// wrapping a bold. -var reBoldItalicUnderscoreOutside = regexp.MustCompile(`_\*\*\S[^_*]*?\S\*\*_|_\*\*\S\*\*_`) + // Italic wrapping bold (the two emphases use different delimiter chars). + {"_**text**_", regexp.MustCompile(`_\*\*\S[^_*]*?\S\*\*_|_\*\*\S\*\*_`)}, + {"*__text__*", regexp.MustCompile(`\*__\S[^_*]*?\S__\*|\*__\S__\*`)}, +} // checkDocsUpdateBoldItalic flags Markdown emphases that attempt to // combine bold and italic in a way Lark cannot represent. Fenced code @@ -82,12 +97,13 @@ func checkDocsUpdateBoldItalic(markdown string) string { return "" } sanitized := stripEscapedEmphasisMarkers(stripMarkdownCodeRegions(markdown)) - if reBoldItalicTriple.MatchString(sanitized) || - reBoldItalicUnderscoreInside.MatchString(sanitized) || - reBoldItalicUnderscoreOutside.MatchString(sanitized) { - return "Lark does not support combined bold+italic markers (***text***, **_text_**, _**text**_); " + - "the emphasis will be downgraded to either bold or italic. " + - "Split into two separate emphases or drop one of them." + for _, p := range combinedEmphasisPatterns { + if p.re.MatchString(sanitized) { + return "Lark does not support combined bold+italic markers " + + "(e.g. 
***text***, ___text___, **_text_**, _**text**_, __*text*__, *__text__*); " + + "the emphasis will be downgraded to either bold or italic. " + + "Split into two separate emphases or drop one of them." + } } return "" } @@ -105,20 +121,19 @@ func proseHasBlankLine(markdown string) bool { inFence := false var fenceMarker string for i, line := range lines { - trimmed := strings.TrimSpace(line) if inFence { - if isCodeFenceClose(trimmed, fenceMarker) { + if isCodeFenceClose(line, fenceMarker) { inFence = false fenceMarker = "" } continue } - if marker := codeFenceOpenMarker(trimmed); marker != "" { + if marker := codeFenceOpenMarker(line); marker != "" { inFence = true fenceMarker = marker continue } - if trimmed == "" && i > 0 && i+1 < len(lines) { + if strings.TrimSpace(line) == "" && i > 0 && i+1 < len(lines) { return true } } @@ -134,16 +149,15 @@ func stripMarkdownCodeRegions(markdown string) string { inFence := false var fenceMarker string for i, line := range lines { - trimmed := strings.TrimSpace(line) if inFence { - if isCodeFenceClose(trimmed, fenceMarker) { + if isCodeFenceClose(line, fenceMarker) { inFence = false fenceMarker = "" } lines[i] = "" continue } - if marker := codeFenceOpenMarker(trimmed); marker != "" { + if marker := codeFenceOpenMarker(line); marker != "" { inFence = true fenceMarker = marker lines[i] = "" @@ -179,33 +193,82 @@ func maskInlineCodeSpans(line string) string { // real combined emphasis. CommonMark renders "\*" as a literal "*" with no // emphasis semantics; dropping the escape + its target from the detection // input keeps the heuristic aligned with what the renderer actually does. +// +// Known limitation: a doubled backslash escape ("\\" followed by a real +// emphasis marker, e.g. `\\***text***`) renders as a literal backslash +// followed by genuine combined emphasis, but this strip is not a proper +// parser and will instead consume the second backslash as the opener for +// another escape. 
That hides the real emphasis from the check, producing +// a false negative. Practical impact is small (this shape is rare in the +// kind of AI-Agent prompts we target) and the alternative — a full +// CommonMark escape parser — is not worth the code surface here. func stripEscapedEmphasisMarkers(s string) string { s = strings.ReplaceAll(s, `\*`, "") s = strings.ReplaceAll(s, `\_`, "") return s } -// codeFenceOpenMarker returns the exact fence marker (e.g. "```" or "~~~~") -// if trimmed opens a fenced code block, otherwise "". Supports any fence of -// length ≥ 3 per CommonMark §4.5. -func codeFenceOpenMarker(trimmed string) string { +// codeFenceOpenMarker returns the fence marker (e.g. "```" or "~~~~") if +// line opens a fenced code block, otherwise "". Applies CommonMark §4.5 +// rules: up to 3 leading spaces are tolerated; 4+ leading spaces (or any +// leading tab, which expands to 4 columns) make the line an indented code +// block rather than a fence. +func codeFenceOpenMarker(line string) string { + body, ok := fenceIndentOK(line) + if !ok { + return "" + } switch { - case strings.HasPrefix(trimmed, "```"): - return leadingRun(trimmed, '`') - case strings.HasPrefix(trimmed, "~~~"): - return leadingRun(trimmed, '~') + case strings.HasPrefix(body, "```"): + return leadingRun(body, '`') + case strings.HasPrefix(body, "~~~"): + return leadingRun(body, '~') } return "" } -// isCodeFenceClose reports whether trimmed closes a fence opened with -// marker. Per CommonMark, the closer must use the same fence character, -// be at least as long as the opener, and contain no info-string text. -func isCodeFenceClose(trimmed, marker string) bool { - if marker == "" || !strings.HasPrefix(trimmed, marker) { +// isCodeFenceClose reports whether line closes a fence opened with marker. +// Per CommonMark §4.5 the closer must use the same fence character, be at +// least as long as the opener, sit within 0..3 leading spaces, and carry +// no info-string text. 
+func isCodeFenceClose(line, marker string) bool { + if marker == "" { + return false + } + body, ok := fenceIndentOK(line) + if !ok { return false } - return strings.TrimSpace(trimmed[len(marker):]) == "" + fenceChar := marker[0] + run := leadingRun(body, fenceChar) + if len(run) < len(marker) { + return false + } + return strings.TrimSpace(body[len(run):]) == "" +} + +// fenceIndentOK returns (bodyWithoutLeadingSpaces, true) when line has +// 0..3 leading spaces and no leading tab — i.e. the indentation is +// permissible for a CommonMark fence. Returns ("", false) otherwise +// (4+ leading spaces or any tab), meaning the line must be treated as +// indented code block content rather than a fence boundary. +func fenceIndentOK(line string) (string, bool) { + for i := 0; i < len(line) && i < 4; i++ { + switch line[i] { + case ' ': + continue + case '\t': + return "", false + default: + return line[i:], true + } + } + // Reached index 4 without hitting a non-space character: too indented. + if len(line) >= 4 { + return "", false + } + // Line shorter than 4 chars and all spaces — still valid (empty content). + return "", true } // leadingRun returns the longest prefix of s made up of the byte c. diff --git a/shortcuts/doc/docs_update_check_test.go b/shortcuts/doc/docs_update_check_test.go index c71e7c313..6208e0540 100644 --- a/shortcuts/doc/docs_update_check_test.go +++ b/shortcuts/doc/docs_update_check_test.go @@ -96,6 +96,47 @@ func TestCheckDocsUpdateReplaceMultilineMarkdown(t *testing.T) { markdown: "prose before\n```go\nfmt.Println(\"hi\")\n```\nprose after", wantHint: false, }, + { + // CommonMark §4.5: the closing fence must be ≥ opening fence length. + // A 4-backtick close for a 3-backtick open is a legitimate way to + // embed triple-backticks in a code sample; the check must see the + // fence as properly closed and not treat the rest of the document + // as still-inside-fence. 
+ name: "longer close marker closes fence correctly", + mode: "replace_range", + markdown: "```\nsome code\n````\n\nprose paragraph after", + wantHint: true, // the blank line AFTER the fence is real prose + }, + { + name: "longer close marker still hides blank line inside fence", + mode: "replace_range", + markdown: "```\nbefore\n\nafter\n````", + wantHint: false, + }, + { + // 4+ leading spaces make the line an indented code block, not a + // fence open. The "fence"-looking line is code content; the + // surrounding blank must still be detected. + name: "four-space indented fence-like line is not a fence open", + mode: "replace_range", + markdown: "first paragraph\n\n ```\n code\n ```", + wantHint: true, + }, + { + // A tab in the leading whitespace is always ≥4 columns and thus + // forces indented-code-block semantics. + name: "tab-indented fence-like line is not a fence open", + mode: "replace_range", + markdown: "first paragraph\n\n\t```\n\tcode\n\t```", + wantHint: true, + }, + { + // 3 leading spaces is still within the fence-tolerance window. 
+ name: "three-space indented fence is still a fence", + mode: "replace_range", + markdown: " ```\ncode\n\nmore\n ```", + wantHint: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -216,6 +257,90 @@ func TestCheckDocsUpdateBoldItalic(t *testing.T) { input: "real ***strong***\n\n```\nliteral ***keyword*** in code\n```", wantHint: true, }, + // --- Triple-underscore combined emphasis: ___text___ --- + { + name: "triple underscores flagged", + input: "a ___key insight___ here", + wantHint: true, + }, + { + name: "triple underscores single char flagged", + input: "a ___X___ here", + wantHint: true, + }, + { + name: "triple underscores inside fenced code not flagged", + input: "sample:\n```\nuse ___keyword___ carefully\n```", + wantHint: false, + }, + { + name: "triple underscores inside inline code not flagged", + input: "the literal `___phrase___` marker", + wantHint: false, + }, + { + name: "escaped triple underscores not flagged", + input: `literal \___phrase___ with escaped opener`, + wantHint: false, + }, + // --- Underscore-bold wrapping asterisk-italic: __*text*__ --- + { + name: "underscore-bold wrapping asterisk-italic flagged", + input: "note: __*important*__ text", + wantHint: true, + }, + { + name: "underscore-bold wrapping asterisk-italic inside fenced code not flagged", + input: "```\nnote: __*important*__ sample\n```", + wantHint: false, + }, + { + name: "underscore-bold wrapping asterisk-italic inside inline code not flagged", + input: "literal `__*important*__` marker", + wantHint: false, + }, + // --- Asterisk-italic wrapping underscore-bold: *__text__* --- + { + name: "asterisk-italic wrapping underscore-bold flagged", + input: "note: *__phrase__* text", + wantHint: true, + }, + { + name: "asterisk-italic wrapping underscore-bold inside fenced code not flagged", + input: "```md\nnote: *__phrase__* sample\n```", + wantHint: false, + }, + // --- Positive tests: real emphasis in prose coexisting with fake in code --- + { + 
// Underscore-variant in prose must still fire when an asterisk + // variant appears inside a code span — verifies the strip does + // not over-sanitize across the six regex alternatives. + name: "real triple underscores outside inline code still flag when asterisk variant is in code", + input: "real ___strong___ and literal `***shape***` in code", + wantHint: true, + }, + { + // Longer close fence closes properly; real ***emphasis*** after + // the fence must fire. + name: "real emphasis after a fence closed by longer marker still flags", + input: "```\nliteral ***phrase*** in code\n````\n\nand then real ***phrase*** after", + wantHint: true, + }, + { + // 4-space indented "```" is an indented code block, not a fence + // open. The fence helper should refuse it; emphasis outside the + // (non-existent) fence must still be detected. + name: "four-space indented fence-like line does not open a fence for the emphasis check", + input: "prose\n\n ```\n not a fence\n ```\n\nreal ***strong*** here", + wantHint: true, + }, + { + // 3-space indented fence is valid per CommonMark. Emphasis inside + // must be sanitized away, so the check must not fire. + name: "three-space indented fence still hides triple-asterisk inside", + input: " ```\n literal ***text*** inside\n ```", + wantHint: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {