diff --git a/shortcuts/doc/markdown_fix.go b/shortcuts/doc/markdown_fix.go index 1ead7a61c..f72da6e0d 100644 --- a/shortcuts/doc/markdown_fix.go +++ b/shortcuts/doc/markdown_fix.go @@ -13,31 +13,36 @@ import ( // fixExportedMarkdown applies post-processing to Lark-exported Markdown to // improve round-trip fidelity on re-import: // -// 1. fixBoldSpacing: removes trailing whitespace before closing ** / *, +// 1. fixLarkTables: converts Feishu XML tables (<lark-table>, <lark-tr>, +// <lark-td>) to standard Markdown tables. Applied only outside fenced +// code blocks. Tables with merged cells (colspan/rowspan) are skipped. +// +// 2. fixBoldSpacing: removes trailing whitespace before closing ** / *, // and strips redundant ** from ATX headings. Applied only outside fenced // code blocks, and skips inline code spans. // -// 2. normalizeNestedListIndentation: rewrites space-pair-indented nested list +// 3. normalizeNestedListIndentation: rewrites space-pair-indented nested list // markers to tab-indented markers. This avoids nested ordered list items // being flattened or interpreted as plain text/code on re-import. // -// 3. fixSetextAmbiguity: inserts a blank line before any "---" that immediately +// 4. fixSetextAmbiguity: inserts a blank line before any "---" that immediately // follows a non-empty line, preventing it from being parsed as a Setext H2. // Applied only outside fenced code blocks. // -// 4. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between +// 5. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between // consecutive blockquote content lines so create-doc preserves line breaks. // Applied only outside fenced code blocks. // -// 5. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty +// 6. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty // lines at the top level and inside content containers (callout, // quote-container, lark-td). 
Code fences are left untouched, and // consecutive list items / continuations are not separated. // -// 6. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with +// 7. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with // actual Unicode emoji characters that create-doc understands. Applied only // outside fenced code blocks. func fixExportedMarkdown(md string) string { + md = applyOutsideCodeFences(md, fixLarkTables) md = applyOutsideCodeFences(md, fixBoldSpacing) md = applyOutsideCodeFences(md, normalizeNestedListIndentation) md = applyOutsideCodeFences(md, fixSetextAmbiguity) @@ -538,3 +543,66 @@ func fixTopLevelSoftbreaks(md string) string { return strings.Join(out, "\n") } + +// fixLarkTables converts Feishu/Lark XML tables to standard Markdown tables. +// It handles: +// - Simple tables with rows () and cells () +// - Empty cells +// - Cells containing pipe characters (escaped as \|) +// - Multiline cell content (converted to
// Patterns for the Feishu/Lark table markup, compiled once at package scope so
// fixLarkTables does not recompile them on every call (or on every row).
//
// NOTE(review): the tag names below were reconstructed; confirm them against
// real exporter output (in particular <lark-tr> and the <br> line-break tag).
var (
	// larkTableBlockRe matches one whole <lark-table ...>...</lark-table> block.
	larkTableBlockRe = regexp.MustCompile(`(?s)<lark-table[^>]*>(.*?)</lark-table>`)
	// larkTableRowRe captures the inner markup of one <lark-tr> row.
	larkTableRowRe = regexp.MustCompile(`(?s)<lark-tr[^>]*>(.*?)</lark-tr>`)
	// larkTableCellRe captures the inner content of one <lark-td> cell.
	larkTableCellRe = regexp.MustCompile(`(?s)<lark-td[^>]*>(.*?)</lark-td>`)
	// larkMergedCellRe detects a colspan/rowspan attribute inside an opening
	// lark tag. Matching only inside tags means plain cell text that happens
	// to contain "colspan=" does not block the conversion.
	larkMergedCellRe = regexp.MustCompile(`<lark-[a-z]+\s[^>]*\b(?:colspan|rowspan)=`)
	// larkCellReplacer makes cell content safe for a single pipe-table line:
	// line breaks (CRLF or LF) become <br>, and pipes are escaped.
	larkCellReplacer = strings.NewReplacer("\r\n", "<br>", "\n", "<br>", "|", `\|`)
)

// fixLarkTables converts Feishu/Lark XML tables to standard Markdown tables.
//
// For every <lark-table> block found in md:
//   - blocks carrying a colspan/rowspan attribute on a lark tag are returned
//     unchanged, since a pipe table cannot express merged cells;
//   - blocks with no rows, or whose rows contain no cells, are returned
//     unchanged;
//   - otherwise each row becomes one "| a | b |" line, and a "| --- |"
//     separator is inserted after the first row, which acts as the header.
//
// Cell content is whitespace-trimmed, embedded line breaks are replaced with
// <br>, and "|" is escaped as "\|". Rows that have fewer cells than the widest
// row are padded with empty cells so that every row (the header in particular)
// has the same number of cells as the separator row.
//
// Text outside <lark-table> blocks is returned untouched.
func fixLarkTables(md string) string {
	return larkTableBlockRe.ReplaceAllStringFunc(md, func(table string) string {
		if larkMergedCellRe.MatchString(table) {
			return table
		}

		var rows [][]string
		width := 0
		for _, row := range larkTableRowRe.FindAllStringSubmatch(table, -1) {
			cells := larkTableCellRe.FindAllStringSubmatch(row[1], -1)
			if len(cells) == 0 {
				// A row without cells contributes nothing to the table.
				continue
			}
			contents := make([]string, 0, len(cells))
			for _, cell := range cells {
				contents = append(contents, larkCellReplacer.Replace(strings.TrimSpace(cell[1])))
			}
			rows = append(rows, contents)
			if len(contents) > width {
				width = len(contents)
			}
		}
		if len(rows) == 0 {
			return table
		}

		lines := make([]string, 0, len(rows)+1)
		for i, cells := range rows {
			// Pad short rows: a header whose cell count differs from the
			// separator row would stop the table from being recognised.
			for len(cells) < width {
				cells = append(cells, "")
			}
			lines = append(lines, "| "+strings.Join(cells, " | ")+" |")
			if i == 0 {
				lines = append(lines, "|"+strings.Repeat(" --- |", width))
			}
		}
		return strings.Join(lines, "\n")
	})
}
+ { + name: "table with multiline content", + input: ` + +line1 +line2 +single + +`, + want: `| line1
line2 | single | +| --- | --- |`, + }, + { + name: "table with merged cells not converted", + input: ` + +merged + +`, + want: ` + +merged + +`, + }, + { + name: "table inside code block not converted", + input: "```\n\n\ncell\n\n\n```", + want: "```\n\n\ncell\n\n\n```", + }, + { + name: "multiple tables in document", + input: `# Title + + + +A + + + +More text. + + + +B + +`, + want: `# Title + +| A | +| --- | + +More text. + +| B | +| --- |`, + }, + { + name: "table with whitespace in cells", + input: ` + + + content + + +`, + want: `| content | +| --- |`, + }, + { + name: "table with attributes on tags", + input: ` + +Data + +`, + want: `| Data | +| --- |`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := applyOutsideCodeFences(tt.input, fixLarkTables) + if got != tt.want { + t.Errorf("fixLarkTables(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + +func TestFixLarkTablesIntegrated(t *testing.T) { + // Test that fixLarkTables is applied in fixExportedMarkdown + input := `# Document + + + +Header + + +Data + + + +End.` + result := fixExportedMarkdown(input) + + // Should contain markdown table format + if !strings.Contains(result, "| Header |") { + t.Error("expected markdown table header in output") + } + if !strings.Contains(result, "| --- |") { + t.Error("expected markdown table separator in output") + } + if !strings.Contains(result, "| Data |") { + t.Error("expected markdown table data row in output") + } + // Should NOT contain XML tags + if strings.Contains(result, "") { + t.Error("expected lark-table XML to be converted") + } +}