Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 150 additions & 26 deletions shortcuts/doc/markdown_fix.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import (
"regexp"
"strings"
"unicode"
"unicode/utf8"
)

// fixExportedMarkdown applies post-processing to Lark-exported Markdown to
Expand All @@ -15,24 +17,29 @@
// and strips redundant ** from ATX headings. Applied only outside fenced
// code blocks, and skips inline code spans.
//
// 2. fixSetextAmbiguity: inserts a blank line before any "---" that immediately
// 2. normalizeNestedListIndentation: rewrites space-pair-indented nested list
// markers to tab-indented markers. This avoids nested ordered list items
// being flattened or interpreted as plain text/code on re-import.
//
// 3. fixSetextAmbiguity: inserts a blank line before any "---" that immediately
// follows a non-empty line, preventing it from being parsed as a Setext H2.
// Applied only outside fenced code blocks.
//
// 3. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between
// 4. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between
// consecutive blockquote content lines so create-doc preserves line breaks.
// Applied only outside fenced code blocks.
//
// 4. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty
// 5. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty
// lines at the top level and inside content containers (callout,
// quote-container, lark-td). Code fences are left untouched, and
// consecutive list items / continuations are not separated.
//
// 5. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with
// 6. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with
// actual Unicode emoji characters that create-doc understands. Applied only
// outside fenced code blocks.
func fixExportedMarkdown(md string) string {
md = applyOutsideCodeFences(md, fixBoldSpacing)
md = applyOutsideCodeFences(md, normalizeNestedListIndentation)
md = applyOutsideCodeFences(md, fixSetextAmbiguity)
md = applyOutsideCodeFences(md, fixBlockquoteHardBreaks)
md = fixTopLevelSoftbreaks(md)
Expand Down Expand Up @@ -106,20 +113,21 @@
return strings.Join(out, "\n")
}

// fixBoldSpacing fixes two issues with bold markers exported by Lark:
// fixBoldSpacing normalizes emphasis markers exported by Lark while preserving
// inline code spans:
//
// 1. Removes leading whitespace after opening ** and * delimiters:
// "** text**" → "**text**", "* text*" → "*text*"
//
// 1. Trailing whitespace before closing **: "**text **" → "**text**"
// CommonMark requires no space before a closing delimiter; otherwise the
// ** is rendered as literal text.
// 2. Removes trailing whitespace before closing ** and * delimiters:
// "**text **" → "**text**", "*text *" → "*text*"
//
// 2. Redundant bold in ATX headings: "# **text**" → "# text"
// Headings are already bold, so the inner ** is visually redundant and
// some renderers display the markers literally.
// 3. Removes redundant bold around an entire ATX heading:
// "# **text**" → "# text"
//
// Both fixes skip inline code spans to avoid modifying literal code content.
// The bold and italic spacing fixes only run on non-code segments so literal
// code content is left unchanged.
var (
boldTrailingSpaceRe = regexp.MustCompile(`(\*\*\S[^*]*?)\s+(\*\*)`)
italicTrailingSpaceRe = regexp.MustCompile(`(\*\S[^*]*?)\s+(\*)`)
// headingBoldRe uses [^*]+ (no asterisks) to avoid mismatching headings
// that contain multiple disjoint bold spans such as "# **foo** and **bar**".
headingBoldRe = regexp.MustCompile(`(?m)^(#{1,6})\s+\*\*([^*]+)\*\*\s*$`)
Expand Down Expand Up @@ -182,38 +190,116 @@
// fixBoldSpacingLine applies bold/italic trailing-space fixes to a single line,
// skipping content inside inline code spans to avoid corrupting literal code.
// ATX heading lines are also skipped here because headingBoldRe in fixBoldSpacing
// handles them separately and boldTrailingSpaceRe can misfire on headings with
// multiple disjoint bold spans (e.g. "# **foo** and **bar**").
// handles them separately, keeping heading-only normalization isolated from the
// inline emphasis spacing scanner below.
func fixBoldSpacingLine(line string) string {
if atxHeadingRe.MatchString(line) {
return line
}
spans := scanInlineCodeSpans(line)
if len(spans) == 0 {
line = boldTrailingSpaceRe.ReplaceAllString(line, "$1$2")
line = italicTrailingSpaceRe.ReplaceAllString(line, "$1$2")
return line
return fixEmphasisSpacingSegment(line)
}
var sb strings.Builder
pos := 0
for _, loc := range spans {
// Process the non-code segment before this inline code span.
seg := line[pos:loc[0]]
seg = boldTrailingSpaceRe.ReplaceAllString(seg, "$1$2")
seg = italicTrailingSpaceRe.ReplaceAllString(seg, "$1$2")
sb.WriteString(seg)
sb.WriteString(fixEmphasisSpacingSegment(seg))
// Preserve inline code span as-is.
sb.WriteString(line[loc[0]:loc[1]])
pos = loc[1]
}
// Remaining non-code segment after the last code span.
seg := line[pos:]
seg = boldTrailingSpaceRe.ReplaceAllString(seg, "$1$2")
seg = italicTrailingSpaceRe.ReplaceAllString(seg, "$1$2")
sb.WriteString(seg)
sb.WriteString(fixEmphasisSpacingSegment(line[pos:]))
return sb.String()
}

// fixEmphasisSpacingSegment trims only the whitespace immediately inside simple
// *...* and **...** spans. It deliberately ignores runs of 3+ asterisks and
// any candidate whose payload contains another asterisk so nested emphasis-like
// text remains untouched. When both inner sides contain whitespace, single-rune
// payloads are preserved as literal text (for example "* x *" and "** x **").
func fixEmphasisSpacingSegment(seg string) string {
if !strings.Contains(seg, "*") {
return seg
}

var sb strings.Builder
pos := 0
for pos < len(seg) {
openStart, openEnd, ok := nextAsteriskRun(seg, pos)
if !ok {
sb.WriteString(seg[pos:])
break
}

sb.WriteString(seg[pos:openStart])

markerLen := openEnd - openStart
if markerLen != 1 && markerLen != 2 {
sb.WriteString(seg[openStart:openEnd])
pos = openEnd
continue
}

closeStart, closeEnd, ok := nextAsteriskRun(seg, openEnd)
if !ok || closeEnd-closeStart != markerLen {
sb.WriteString(seg[openStart:openEnd])
pos = openEnd
continue
}

payload := seg[openEnd:closeStart]
normalized, shouldNormalize := normalizeEmphasisPayload(payload)
if !shouldNormalize {
sb.WriteString(seg[openStart:closeEnd])
pos = closeEnd
continue
}

marker := seg[openStart:openEnd]
sb.WriteString(marker)
sb.WriteString(normalized)
sb.WriteString(marker)
pos = closeEnd
}
return sb.String()
}

func nextAsteriskRun(s string, start int) (runStart, runEnd int, ok bool) {
for i := start; i < len(s); i++ {
if s[i] != '*' {
continue
}
j := i
for j < len(s) && s[j] == '*' {
j++
}
return i, j, true
}
return 0, 0, false
}

func normalizeEmphasisPayload(payload string) (string, bool) {
trimmedLeft := strings.TrimLeftFunc(payload, unicode.IsSpace)
trimmed := strings.TrimRightFunc(trimmedLeft, unicode.IsSpace)
if trimmed == "" {
return payload, false

Check warning on line 288 in shortcuts/doc/markdown_fix.go

View check run for this annotation

Codecov / codecov/patch

shortcuts/doc/markdown_fix.go#L288

Added line #L288 was not covered by tests
}

hasLeadingSpace := len(trimmedLeft) != len(payload)
hasTrailingSpace := len(trimmed) != len(trimmedLeft)
if !hasLeadingSpace && !hasTrailingSpace {
return payload, true
}

if hasLeadingSpace && hasTrailingSpace && utf8.RuneCountInString(trimmed) == 1 {
return payload, false
}
return trimmed, true
}

var setextRe = regexp.MustCompile(`(?m)^([^\n]+)\n(-{3,}\s*$)`)

func fixSetextAmbiguity(md string) string {
Expand Down Expand Up @@ -291,6 +377,44 @@
// indented (nested) items.
var listItemRe = regexp.MustCompile(`^[ \t]*([-*+]|\d+[.)]) `)

// nestedListIndentRe matches nested list item markers indented with pairs of
// spaces. We rewrite those space pairs to tabs because some downstream
// round-trip paths treat multi-space indented ordered items as flat items or
// literal text, while tab indentation remains nested and avoids 4-space code
// block ambiguity.
var nestedListIndentRe = regexp.MustCompile(`^( {2,})([-*+]|\d+[.)]) `)

func normalizeNestedListIndentation(md string) string {
lines := strings.Split(md, "\n")
for i, line := range lines {
matches := nestedListIndentRe.FindStringSubmatch(line)
Comment thread
fangshuyu-768 marked this conversation as resolved.
if len(matches) != 3 {
continue
}
if !hasPreviousNonBlankListItem(lines, i) {
Comment thread
fangshuyu-768 marked this conversation as resolved.
continue
}
indent := matches[1]
if len(indent)%2 != 0 {
continue

Check warning on line 399 in shortcuts/doc/markdown_fix.go

View check run for this annotation

Codecov / codecov/patch

shortcuts/doc/markdown_fix.go#L399

Added line #L399 was not covered by tests
}
tabs := strings.Repeat("\t", len(indent)/2)
lines[i] = tabs + line[len(indent):]
}
return strings.Join(lines, "\n")
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

func hasPreviousNonBlankListItem(lines []string, index int) bool {
for i := index - 1; i >= 0; i-- {
trimmed := strings.TrimSpace(lines[i])
if trimmed == "" {
return false
}
return listItemRe.MatchString(lines[i])
}
return false

Check warning on line 415 in shortcuts/doc/markdown_fix.go

View check run for this annotation

Codecov / codecov/patch

shortcuts/doc/markdown_fix.go#L415

Added line #L415 was not covered by tests
}

// isListItemOrContinuation returns true for lines that are part of a list:
// either a list item marker line or an indented continuation of a list item.
// This is used to prevent blank lines being inserted between tight list lines,
Expand Down
107 changes: 107 additions & 0 deletions shortcuts/doc/markdown_fix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,56 @@ func TestFixBoldSpacing(t *testing.T) {
input string
want string
}{
{
name: "leading space after opening bold",
input: "** hello**",
want: "**hello**",
},
{
name: "leading space after opening italic",
input: "* hello*",
want: "*hello*",
},
{
name: "leading and trailing spaces inside bold are collapsed",
input: "** hello **",
want: "**hello**",
},
{
name: "leading and trailing spaces inside italic are collapsed",
input: "* hello *",
want: "*hello*",
},
{
name: "multiple spaced italic spans on one line are each collapsed",
input: "* a* * b*",
want: "*a* *b*",
},
{
name: "ambiguous italic span stays literal",
input: "2 * x * y",
want: "2 * x * y",
},
{
name: "ambiguous bold span stays literal",
input: "2 ** x ** y",
want: "2 ** x ** y",
},
{
name: "single-rune italic with spaces on both sides stays literal",
input: "* x *",
want: "* x *",
},
{
name: "single-rune bold with spaces on both sides stays literal",
input: "** x **",
want: "** x **",
},
{
name: "triple-asterisk near miss stays literal",
input: "*** hello**",
want: "*** hello**",
},
{
name: "trailing space before closing bold",
input: "**hello **",
Expand Down Expand Up @@ -54,6 +104,16 @@ func TestFixBoldSpacing(t *testing.T) {
input: "**foo ** and `**bar **`",
want: "**foo** and `**bar **`",
},
{
name: "inline code with spaced italic stays literal while outside span is fixed",
input: "`* hello *` and * hello *",
want: "`* hello *` and *hello*",
},
{
name: "opening space inside text tag fixed",
input: `<text color="red">** Helpful - 有用性:**</text>`,
want: `<text color="red">**Helpful - 有用性:**</text>`,
},
{
name: "double-backtick inline code not modified",
input: "``**hello **`` and **world **",
Expand Down Expand Up @@ -222,6 +282,53 @@ func TestFixTopLevelSoftbreaks(t *testing.T) {
}
}

func TestNormalizeNestedListIndentation(t *testing.T) {
Comment thread
fangshuyu-768 marked this conversation as resolved.
tests := []struct {
name string
input string
want string
}{
{
name: "nested ordered list uses tabs instead of space pairs",
input: "1. parent\n 1. child\n 1. grandchild",
want: "1. parent\n\t1. child\n\t\t1. grandchild",
},
{
name: "nested mixed list markers use tabs instead of space pairs",
input: "- parent\n - child\n 1. grandchild",
want: "- parent\n\t- child\n\t\t1. grandchild",
},
{
name: "top-level list unchanged",
input: "1. parent\n2. sibling",
want: "1. parent\n2. sibling",
},
{
name: "indented top-level marker without parent list stays unchanged",
input: "paragraph\n\n 1. item",
want: "paragraph\n\n 1. item",
},
{
name: "blank-line-separated loose-list sibling stays unchanged",
input: "1. a\n\n 1. b",
want: "1. a\n\n 1. b",
},
{
name: "indented code block inside list item stays unchanged",
input: "- parent\n\n 1. code",
want: "- parent\n\n 1. code",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := normalizeNestedListIndentation(tt.input)
if got != tt.want {
t.Errorf("normalizeNestedListIndentation(%q) = %q, want %q", tt.input, got, tt.want)
}
})
}
}

func TestFixExportedMarkdown(t *testing.T) {
// End-to-end: all fixes applied together
input := "# **Title**\nparagraph one\nparagraph two\n**bold **\n> q1\n> q2\nsome text\n---"
Expand Down
Loading