diff --git a/shortcuts/doc/markdown_fix.go b/shortcuts/doc/markdown_fix.go index f9b28dccf..1ead7a61c 100644 --- a/shortcuts/doc/markdown_fix.go +++ b/shortcuts/doc/markdown_fix.go @@ -6,6 +6,8 @@ package doc import ( "regexp" "strings" + "unicode" + "unicode/utf8" ) // fixExportedMarkdown applies post-processing to Lark-exported Markdown to @@ -15,24 +17,29 @@ import ( // and strips redundant ** from ATX headings. Applied only outside fenced // code blocks, and skips inline code spans. // -// 2. fixSetextAmbiguity: inserts a blank line before any "---" that immediately +// 2. normalizeNestedListIndentation: rewrites space-pair-indented nested list +// markers to tab-indented markers. This avoids nested ordered list items +// being flattened or interpreted as plain text/code on re-import. +// +// 3. fixSetextAmbiguity: inserts a blank line before any "---" that immediately // follows a non-empty line, preventing it from being parsed as a Setext H2. // Applied only outside fenced code blocks. // -// 3. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between +// 4. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between // consecutive blockquote content lines so create-doc preserves line breaks. // Applied only outside fenced code blocks. // -// 4. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty +// 5. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty // lines at the top level and inside content containers (callout, // quote-container, lark-td). Code fences are left untouched, and // consecutive list items / continuations are not separated. // -// 5. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with +// 6. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with // actual Unicode emoji characters that create-doc understands. Applied only // outside fenced code blocks. 
func fixExportedMarkdown(md string) string { md = applyOutsideCodeFences(md, fixBoldSpacing) + md = applyOutsideCodeFences(md, normalizeNestedListIndentation) md = applyOutsideCodeFences(md, fixSetextAmbiguity) md = applyOutsideCodeFences(md, fixBlockquoteHardBreaks) md = fixTopLevelSoftbreaks(md) @@ -106,20 +113,21 @@ func fixBlockquoteHardBreaks(md string) string { return strings.Join(out, "\n") } -// fixBoldSpacing fixes two issues with bold markers exported by Lark: +// fixBoldSpacing normalizes emphasis markers exported by Lark while preserving +// inline code spans: +// +// 1. Removes leading whitespace after opening ** and * delimiters: +// "** text**" → "**text**", "* text*" → "*text*" // -// 1. Trailing whitespace before closing **: "**text **" → "**text**" -// CommonMark requires no space before a closing delimiter; otherwise the -// ** is rendered as literal text. +// 2. Removes trailing whitespace before closing ** and * delimiters: +// "**text **" → "**text**", "*text *" → "*text*" // -// 2. Redundant bold in ATX headings: "# **text**" → "# text" -// Headings are already bold, so the inner ** is visually redundant and -// some renderers display the markers literally. +// 3. Removes redundant bold around an entire ATX heading: +// "# **text**" → "# text" // -// Both fixes skip inline code spans to avoid modifying literal code content. +// The bold and italic spacing fixes only run on non-code segments so literal +// code content is left unchanged. var ( - boldTrailingSpaceRe = regexp.MustCompile(`(\*\*\S[^*]*?)\s+(\*\*)`) - italicTrailingSpaceRe = regexp.MustCompile(`(\*\S[^*]*?)\s+(\*)`) // headingBoldRe uses [^*]+ (no asterisks) to avoid mismatching headings // that contain multiple disjoint bold spans such as "# **foo** and **bar**". 
headingBoldRe = regexp.MustCompile(`(?m)^(#{1,6})\s+\*\*([^*]+)\*\*\s*$`) @@ -182,38 +190,116 @@ func scanInlineCodeSpans(line string) [][2]int { // fixBoldSpacingLine applies bold/italic trailing-space fixes to a single line, // skipping content inside inline code spans to avoid corrupting literal code. // ATX heading lines are also skipped here because headingBoldRe in fixBoldSpacing -// handles them separately and boldTrailingSpaceRe can misfire on headings with -// multiple disjoint bold spans (e.g. "# **foo** and **bar**"). +// handles them separately, keeping heading-only normalization isolated from the +// inline emphasis spacing scanner below. func fixBoldSpacingLine(line string) string { if atxHeadingRe.MatchString(line) { return line } spans := scanInlineCodeSpans(line) if len(spans) == 0 { - line = boldTrailingSpaceRe.ReplaceAllString(line, "$1$2") - line = italicTrailingSpaceRe.ReplaceAllString(line, "$1$2") - return line + return fixEmphasisSpacingSegment(line) } var sb strings.Builder pos := 0 for _, loc := range spans { // Process the non-code segment before this inline code span. seg := line[pos:loc[0]] - seg = boldTrailingSpaceRe.ReplaceAllString(seg, "$1$2") - seg = italicTrailingSpaceRe.ReplaceAllString(seg, "$1$2") - sb.WriteString(seg) + sb.WriteString(fixEmphasisSpacingSegment(seg)) // Preserve inline code span as-is. sb.WriteString(line[loc[0]:loc[1]]) pos = loc[1] } // Remaining non-code segment after the last code span. - seg := line[pos:] - seg = boldTrailingSpaceRe.ReplaceAllString(seg, "$1$2") - seg = italicTrailingSpaceRe.ReplaceAllString(seg, "$1$2") - sb.WriteString(seg) + sb.WriteString(fixEmphasisSpacingSegment(line[pos:])) return sb.String() } +// fixEmphasisSpacingSegment trims only the whitespace immediately inside simple +// *...* and **...** spans. It deliberately ignores runs of 3+ asterisks and +// any candidate whose payload contains another asterisk so nested emphasis-like +// text remains untouched. 
// When both inner sides contain whitespace, single-rune
// payloads are preserved as literal text (for example "* x *" and "** x **").
//
// The scan pairs each run of one or two asterisks with the next asterisk run
// in seg. If that next run has a different length, the opening run is emitted
// unchanged and scanning resumes right after it, so the mismatched run is
// still considered as a possible opener. Runs of three or more asterisks are
// copied through verbatim.
//
// Backslash-escaped asterisks ("\*") are literal characters, not delimiters,
// so they are never paired and text such as "\* foo \*" is returned unchanged.
//
// NOTE(review): a "* " bullet marker looks the same as a spaced opening
// delimiter to this scanner, so a line such as "* item *x*" is rewritten to
// "*item*x*". Confirm that callers never pass "*"-bulleted list lines.
func fixEmphasisSpacingSegment(seg string) string {
	if !strings.Contains(seg, "*") {
		return seg
	}

	var sb strings.Builder
	sb.Grow(len(seg)) // trimming only removes bytes, so len(seg) is an upper bound
	pos := 0
	for pos < len(seg) {
		openStart, openEnd, ok := nextAsteriskRun(seg, pos)
		if !ok {
			sb.WriteString(seg[pos:])
			break
		}
		sb.WriteString(seg[pos:openStart])

		marker := seg[openStart:openEnd]
		if len(marker) > 2 {
			// "***" and longer runs are left exactly as written.
			sb.WriteString(marker)
			pos = openEnd
			continue
		}

		closeStart, closeEnd, ok := nextAsteriskRun(seg, openEnd)
		if !ok || closeEnd-closeStart != len(marker) {
			// No same-length partner: emit the opener and rescan from just
			// after it so the next run gets its own chance to open a span.
			sb.WriteString(marker)
			pos = openEnd
			continue
		}

		normalized, shouldNormalize := normalizeEmphasisPayload(seg[openEnd:closeStart])
		if !shouldNormalize {
			sb.WriteString(seg[openStart:closeEnd])
			pos = closeEnd
			continue
		}

		sb.WriteString(marker)
		sb.WriteString(normalized)
		sb.WriteString(marker)
		pos = closeEnd
	}
	return sb.String()
}

// nextAsteriskRun returns the half-open byte range [runStart, runEnd) of the
// first run of consecutive, unescaped '*' bytes in s at or after start. ok is
// false when no such run exists.
//
// A backslash escapes the byte that follows it, so the asterisk in "\*" is
// skipped, while in "\\*" the second backslash is the escaped byte and the
// asterisk still starts a run.
func nextAsteriskRun(s string, start int) (runStart, runEnd int, ok bool) {
	for i := start; i < len(s); i++ {
		switch s[i] {
		case '\\':
			i++ // skip the escaped byte, whatever it is
		case '*':
			j := i + 1
			for j < len(s) && s[j] == '*' {
				j++
			}
			return i, j, true
		}
	}
	return 0, 0, false
}

// normalizeEmphasisPayload decides how the text between a matched pair of
// emphasis markers should be written back.
//
// It returns (payload, false) when the span must be left exactly as written:
// the payload is empty or all whitespace, it is a single rune padded on both
// sides (e.g. " x ", which is more likely arithmetic than emphasis), or
// trimming would leave a lone backslash directly before the closing marker
// and thereby escape it. Otherwise it returns the payload with leading and
// trailing whitespace removed and true; a payload without such whitespace is
// returned unchanged with true.
func normalizeEmphasisPayload(payload string) (string, bool) {
	trimmedLeft := strings.TrimLeftFunc(payload, unicode.IsSpace)
	trimmed := strings.TrimRightFunc(trimmedLeft, unicode.IsSpace)
	if trimmed == "" {
		return payload, false
	}

	hasLeadingSpace := len(trimmedLeft) != len(payload)
	hasTrailingSpace := len(trimmed) != len(trimmedLeft)
	switch {
	case !hasLeadingSpace && !hasTrailingSpace:
		return payload, true
	case hasLeadingSpace && hasTrailingSpace && utf8.RuneCountInString(trimmed) == 1:
		return payload, false
	case hasTrailingSpace && endsWithUnescapedBackslash(trimmed):
		// "*foo\ *" must not become "*foo\*": that would escape the closer.
		return payload, false
	}
	return trimmed, true
}

// endsWithUnescapedBackslash reports whether s ends in an odd number of
// backslashes, i.e. whether its final backslash would escape the next byte.
func endsWithUnescapedBackslash(s string) bool {
	n := 0
	for i := len(s) - 1; i >= 0 && s[i] == '\\'; i-- {
		n++
	}
	return n%2 == 1
}

// setextRe matches a non-empty line immediately followed by a line made of
// three or more hyphens and optional trailing whitespace.
var setextRe = regexp.MustCompile(`(?m)^([^\n]+)\n(-{3,}\s*$)`)

// listItemRe matches a bullet marker (-, * or +) or an ordered marker (digits
// followed by . or )) that is followed by a space. Leading spaces and tabs are
// allowed, so it also matches indented (nested) items.
var listItemRe = regexp.MustCompile(`^[ \t]*([-*+]|\d+[.)]) `)

// nestedListIndentRe matches nested list item markers indented with pairs of
// spaces. We rewrite those space pairs to tabs because some downstream
// round-trip paths treat multi-space indented ordered items as flat items or
// literal text, while tab indentation remains nested and avoids 4-space code
// block ambiguity.
var nestedListIndentRe = regexp.MustCompile(`^( {2,})([-*+]|\d+[.)]) `)

// normalizeNestedListIndentation rewrites the leading indentation of nested
// list items from space pairs to tabs, one tab per two spaces.
//
// A line is rewritten only when all of the following hold: it starts with two
// or more spaces followed by a list marker, the number of leading spaces is
// even, and the line directly above it is itself a list item. Every other
// line is returned unchanged. Lines are processed top to bottom, so a
// grandchild is checked against its already-rewritten parent line.
func normalizeNestedListIndentation(md string) string {
	lines := strings.Split(md, "\n")
	for i, line := range lines {
		matches := nestedListIndentRe.FindStringSubmatch(line)
		if matches == nil {
			continue
		}
		indent := matches[1]
		if len(indent)%2 != 0 || !hasPreviousNonBlankListItem(lines, i) {
			continue
		}
		lines[i] = strings.Repeat("\t", len(indent)/2) + line[len(indent):]
	}
	return strings.Join(lines, "\n")
}

// hasPreviousNonBlankListItem reports whether the line directly above
// lines[index] is a list item. It returns false when index is 0 or when the
// preceding line is blank: listItemRe requires a marker, so an empty or
// whitespace-only line can never match.
func hasPreviousNonBlankListItem(lines []string, index int) bool {
	if index <= 0 {
		return false
	}
	return listItemRe.MatchString(lines[index-1])
}

// isListItemOrContinuation returns true for lines that are part of a list:
// either a list item marker line or an indented continuation of a list item.
// This is used to prevent blank lines being inserted between tight list lines, diff --git a/shortcuts/doc/markdown_fix_test.go b/shortcuts/doc/markdown_fix_test.go index b47ab7785..81ac26a9a 100644 --- a/shortcuts/doc/markdown_fix_test.go +++ b/shortcuts/doc/markdown_fix_test.go @@ -14,6 +14,56 @@ func TestFixBoldSpacing(t *testing.T) { input string want string }{ + { + name: "leading space after opening bold", + input: "** hello**", + want: "**hello**", + }, + { + name: "leading space after opening italic", + input: "* hello*", + want: "*hello*", + }, + { + name: "leading and trailing spaces inside bold are collapsed", + input: "** hello **", + want: "**hello**", + }, + { + name: "leading and trailing spaces inside italic are collapsed", + input: "* hello *", + want: "*hello*", + }, + { + name: "multiple spaced italic spans on one line are each collapsed", + input: "* a* * b*", + want: "*a* *b*", + }, + { + name: "ambiguous italic span stays literal", + input: "2 * x * y", + want: "2 * x * y", + }, + { + name: "ambiguous bold span stays literal", + input: "2 ** x ** y", + want: "2 ** x ** y", + }, + { + name: "single-rune italic with spaces on both sides stays literal", + input: "* x *", + want: "* x *", + }, + { + name: "single-rune bold with spaces on both sides stays literal", + input: "** x **", + want: "** x **", + }, + { + name: "triple-asterisk near miss stays literal", + input: "*** hello**", + want: "*** hello**", + }, { name: "trailing space before closing bold", input: "**hello **", @@ -54,6 +104,16 @@ func TestFixBoldSpacing(t *testing.T) { input: "**foo ** and `**bar **`", want: "**foo** and `**bar **`", }, + { + name: "inline code with spaced italic stays literal while outside span is fixed", + input: "`* hello *` and * hello *", + want: "`* hello *` and *hello*", + }, + { + name: "opening space inside text tag fixed", + input: `** Helpful - 有用性:**`, + want: `**Helpful - 有用性:**`, + }, { name: "double-backtick inline code not modified", input: 
"``**hello **`` and **world **", @@ -222,6 +282,53 @@ func TestFixTopLevelSoftbreaks(t *testing.T) { } } +func TestNormalizeNestedListIndentation(t *testing.T) { + tests := []struct { + name string + input string + want string + }{ + { + name: "nested ordered list uses tabs instead of space pairs", + input: "1. parent\n 1. child\n 1. grandchild", + want: "1. parent\n\t1. child\n\t\t1. grandchild", + }, + { + name: "nested mixed list markers use tabs instead of space pairs", + input: "- parent\n - child\n 1. grandchild", + want: "- parent\n\t- child\n\t\t1. grandchild", + }, + { + name: "top-level list unchanged", + input: "1. parent\n2. sibling", + want: "1. parent\n2. sibling", + }, + { + name: "indented top-level marker without parent list stays unchanged", + input: "paragraph\n\n 1. item", + want: "paragraph\n\n 1. item", + }, + { + name: "blank-line-separated loose-list sibling stays unchanged", + input: "1. a\n\n 1. b", + want: "1. a\n\n 1. b", + }, + { + name: "indented code block inside list item stays unchanged", + input: "- parent\n\n 1. code", + want: "- parent\n\n 1. code", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := normalizeNestedListIndentation(tt.input) + if got != tt.want { + t.Errorf("normalizeNestedListIndentation(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + func TestFixExportedMarkdown(t *testing.T) { // End-to-end: all fixes applied together input := "# **Title**\nparagraph one\nparagraph two\n**bold **\n> q1\n> q2\nsome text\n---"