Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 74 additions & 6 deletions shortcuts/doc/markdown_fix.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,36 @@ import (
// fixExportedMarkdown applies post-processing to Lark-exported Markdown to
// improve round-trip fidelity on re-import:
//
// 1. fixBoldSpacing: removes trailing whitespace before closing ** / *,
// 1. fixLarkTables: converts Feishu XML tables (<lark-table>, <lark-tr>,
// <lark-td>) to standard Markdown tables. Applied only outside fenced
// code blocks. Tables with merged cells (colspan/rowspan) are skipped.
//
// 2. fixBoldSpacing: removes trailing whitespace before closing ** / *,
// and strips redundant ** from ATX headings. Applied only outside fenced
// code blocks, and skips inline code spans.
//
// 2. normalizeNestedListIndentation: rewrites space-pair-indented nested list
// 3. normalizeNestedListIndentation: rewrites space-pair-indented nested list
// markers to tab-indented markers. This avoids nested ordered list items
// being flattened or interpreted as plain text/code on re-import.
//
// 3. fixSetextAmbiguity: inserts a blank line before any "---" that immediately
// 4. fixSetextAmbiguity: inserts a blank line before any "---" that immediately
// follows a non-empty line, preventing it from being parsed as a Setext H2.
// Applied only outside fenced code blocks.
//
// 4. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between
// 5. fixBlockquoteHardBreaks: inserts a blank blockquote line (">") between
// consecutive blockquote content lines so create-doc preserves line breaks.
// Applied only outside fenced code blocks.
//
// 5. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty
// 6. fixTopLevelSoftbreaks: inserts a blank line between adjacent non-empty
// lines at the top level and inside content containers (callout,
// quote-container, lark-td). Code fences are left untouched, and
// consecutive list items / continuations are not separated.
//
// 6. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with
// 7. fixCalloutEmoji: replaces named emoji aliases (e.g. emoji="warning") with
// actual Unicode emoji characters that create-doc understands. Applied only
// outside fenced code blocks.
func fixExportedMarkdown(md string) string {
md = applyOutsideCodeFences(md, fixLarkTables)
md = applyOutsideCodeFences(md, fixBoldSpacing)
md = applyOutsideCodeFences(md, normalizeNestedListIndentation)
md = applyOutsideCodeFences(md, fixSetextAmbiguity)
Expand Down Expand Up @@ -538,3 +543,66 @@ func fixTopLevelSoftbreaks(md string) string {

return strings.Join(out, "\n")
}

// Patterns and replacer used by fixLarkTables. They are compiled once at
// package scope instead of on every call (and, for cells, on every row).
var (
	// larkTableRe matches one whole <lark-table>...</lark-table> block.
	larkTableRe = regexp.MustCompile(`(?s)<lark-table[^>]*>.*?</lark-table>`)
	// larkRowRe captures the inner content of each <lark-tr>.
	larkRowRe = regexp.MustCompile(`(?s)<lark-tr[^>]*>(.*?)</lark-tr>`)
	// larkCellRe captures the inner content of each <lark-td>.
	larkCellRe = regexp.MustCompile(`(?s)<lark-td[^>]*>(.*?)</lark-td>`)
	// larkMergedCellAttrRe detects colspan/rowspan in any letter case and
	// with optional whitespace before '=' (e.g. `ROWSPAN = "2"`).
	larkMergedCellAttrRe = regexp.MustCompile(`(?i)\b(?:colspan|rowspan)\s*=`)
	// larkCellEscaper makes cell content safe for a single Markdown table row.
	// "\r\n" is listed before "\n" and "\r" so a CRLF pair yields one <br/>
	// and never leaves a stray carriage return behind.
	larkCellEscaper = strings.NewReplacer(
		"\r\n", "<br/>",
		"\n", "<br/>",
		"\r", "<br/>",
		"|", `\|`,
	)
)

// fixLarkTables converts Feishu/Lark XML tables to standard Markdown tables.
//
// For every <lark-table> block in md:
//   - each <lark-tr> becomes one table row and each <lark-td> one cell;
//   - cell content is trimmed, line breaks become <br/>, and pipe characters
//     are escaped as \|;
//   - the first row is treated as the header and is followed by a "---"
//     separator row;
//   - rows with fewer cells than the widest row are padded with empty cells,
//     so the header and separator always have the same number of columns;
//   - rows are joined with "\r\n" when the table block itself contains CRLF,
//     and with "\n" otherwise.
//
// A table is returned unchanged when it mentions a colspan/rowspan attribute
// (merged cells cannot be expressed in a Markdown table) or when it contains
// no cells at all.
func fixLarkTables(md string) string {
	return larkTableRe.ReplaceAllStringFunc(md, larkTableToMarkdown)
}

// larkTableToMarkdown renders a single <lark-table> block as a Markdown
// table, or returns the block untouched when it cannot be converted.
func larkTableToMarkdown(table string) string {
	// Merged cells: keep the original XML.
	if larkMergedCellAttrRe.MatchString(table) {
		return table
	}

	// Preserve the line-ending convention used by the source table.
	eol := "\n"
	if strings.Contains(table, "\r\n") {
		eol = "\r\n"
	}

	var rows [][]string
	colCount := 0
	for _, row := range larkRowRe.FindAllStringSubmatch(table, -1) {
		cells := larkCellRe.FindAllStringSubmatch(row[1], -1)
		if len(cells) == 0 {
			// A row without cells contributes nothing to the table.
			continue
		}
		contents := make([]string, 0, len(cells))
		for _, cell := range cells {
			contents = append(contents, larkCellEscaper.Replace(strings.TrimSpace(cell[1])))
		}
		rows = append(rows, contents)
		if len(contents) > colCount {
			colCount = len(contents)
		}
	}
	if len(rows) == 0 {
		return table
	}

	lines := make([]string, 0, len(rows)+1)
	for i, cells := range rows {
		// Pad short rows so every row has colCount cells.
		for len(cells) < colCount {
			cells = append(cells, "")
		}
		lines = append(lines, "| "+strings.Join(cells, " | ")+" |")
		if i == 0 {
			// Separator row directly after the header row.
			lines = append(lines, "|"+strings.Repeat(" --- |", colCount))
		}
	}
	return strings.Join(lines, eol)
}
14 changes: 14 additions & 0 deletions shortcuts/doc/markdown_fix_hardening_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,20 @@ func TestFixExportedMarkdownIdempotent(t *testing.T) {
"</quote-container>",
"",
}, "\n"),

"lark-table converted to markdown": strings.Join([]string{
"<lark-table>",
"<lark-tr>",
"<lark-td>Header</lark-td>",
"<lark-td>Value</lark-td>",
"</lark-tr>",
"<lark-tr>",
"<lark-td>Data</lark-td>",
"<lark-td>123</lark-td>",
"</lark-tr>",
"</lark-table>",
"",
}, "\n"),
}

for name, fixture := range fixtures {
Expand Down
171 changes: 171 additions & 0 deletions shortcuts/doc/markdown_fix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,3 +438,174 @@ func TestFixTopLevelSoftbreaksQuoteContainer(t *testing.T) {
t.Errorf("fixTopLevelSoftbreaks quote-container = %q, want %q", got, want)
}
}

// TestFixLarkTables checks the Lark XML table conversion on individual
// inputs. Every case is run through applyOutsideCodeFences so that fenced
// code blocks are excluded in the same way as in fixExportedMarkdown.
func TestFixLarkTables(t *testing.T) {
	tests := []struct {
		name  string
		input string
		want  string
	}{
		{
			name: "simple 2x2 table",
			input: `<lark-table>
<lark-tr>
<lark-td>Header 1</lark-td>
<lark-td>Header 2</lark-td>
</lark-tr>
<lark-tr>
<lark-td>Cell 1</lark-td>
<lark-td>Cell 2</lark-td>
</lark-tr>
</lark-table>`,
			want: `| Header 1 | Header 2 |
| --- | --- |
| Cell 1 | Cell 2 |`,
		},
		{
			name: "table with empty cells",
			input: `<lark-table>
<lark-tr>
<lark-td>A</lark-td>
<lark-td></lark-td>
</lark-tr>
<lark-tr>
<lark-td></lark-td>
<lark-td>B</lark-td>
</lark-tr>
</lark-table>`,
			// Cells are joined with " | " and wrapped in "| " / " |", so an
			// empty cell shows up as two consecutive spaces. The expectation
			// is an interpreted string to keep that spacing explicit.
			want: "| A |  |\n| --- | --- |\n|  | B |",
		},
		{
			name: "table with pipe character escaped",
			input: `<lark-table>
<lark-tr>
<lark-td>a|b</lark-td>
<lark-td>c|d</lark-td>
</lark-tr>
</lark-table>`,
			want: `| a\|b | c\|d |
| --- | --- |`,
		},
		{
			name: "table with multiline content",
			input: `<lark-table>
<lark-tr>
<lark-td>line1
line2</lark-td>
<lark-td>single</lark-td>
</lark-tr>
</lark-table>`,
			want: `| line1<br/>line2 | single |
| --- | --- |`,
		},
		{
			name: "table with merged cells not converted",
			input: `<lark-table>
<lark-tr>
<lark-td colspan="2">merged</lark-td>
</lark-tr>
</lark-table>`,
			want: `<lark-table>
<lark-tr>
<lark-td colspan="2">merged</lark-td>
</lark-tr>
</lark-table>`,
		},
		{
			name:  "table inside code block not converted",
			input: "```\n<lark-table>\n<lark-tr>\n<lark-td>cell</lark-td>\n</lark-tr>\n</lark-table>\n```",
			want:  "```\n<lark-table>\n<lark-tr>\n<lark-td>cell</lark-td>\n</lark-tr>\n</lark-table>\n```",
		},
		{
			name: "multiple tables in document",
			input: `# Title

<lark-table>
<lark-tr>
<lark-td>A</lark-td>
</lark-tr>
</lark-table>

More text.

<lark-table>
<lark-tr>
<lark-td>B</lark-td>
</lark-tr>
</lark-table>`,
			want: `# Title

| A |
| --- |

More text.

| B |
| --- |`,
		},
		{
			name: "table with whitespace in cells",
			input: `<lark-table>
<lark-tr>
<lark-td>
content
</lark-td>
</lark-tr>
</lark-table>`,
			want: `| content |
| --- |`,
		},
		{
			name: "table with attributes on tags",
			input: `<lark-table id="tbl1">
<lark-tr class="row">
<lark-td style="bold">Data</lark-td>
</lark-tr>
</lark-table>`,
			want: `| Data |
| --- |`,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := applyOutsideCodeFences(tt.input, fixLarkTables)
			if got != tt.want {
				t.Errorf("fixLarkTables(%q) = %q, want %q", tt.input, got, tt.want)
			}
		})
	}
}

// TestFixLarkTablesIntegrated verifies that the full fixExportedMarkdown
// pipeline rewrites a <lark-table> block into Markdown table rows and leaves
// no opening <lark-table> tag in the result.
func TestFixLarkTablesIntegrated(t *testing.T) {
	const doc = `# Document

<lark-table>
<lark-tr>
<lark-td>Header</lark-td>
</lark-tr>
<lark-tr>
<lark-td>Data</lark-td>
</lark-tr>
</lark-table>

End.`

	out := fixExportedMarkdown(doc)

	// Fragments that must be present once the table has been converted,
	// each paired with the failure message reported when it is missing.
	required := []struct {
		fragment string
		failure  string
	}{
		{"| Header |", "expected markdown table header in output"},
		{"| --- |", "expected markdown table separator in output"},
		{"| Data |", "expected markdown table data row in output"},
	}
	for _, r := range required {
		if !strings.Contains(out, r.fragment) {
			t.Error(r.failure)
		}
	}

	// The XML wrapper must be gone after conversion.
	if strings.Contains(out, "<lark-table>") {
		t.Error("expected lark-table XML to be converted")
	}
}
Loading