Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions actions/setup/js/sanitize_content.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const {
neutralizeGitHubReferences,
removeXmlComments,
convertXmlTags,
applyToNonCodeRegions,
neutralizeBotTriggers,
applyTruncation,
hardenUnicodeText,
Expand Down Expand Up @@ -90,11 +91,11 @@ function sanitizeContent(content, maxLengthOrOptions) {
// Neutralize @mentions with selective filtering (custom logic for allowed aliases)
sanitized = neutralizeMentions(sanitized, allowedAliasesLowercase);

// Remove XML comments
sanitized = removeXmlComments(sanitized);
// Remove XML comments – skip code blocks and inline code
sanitized = applyToNonCodeRegions(sanitized, removeXmlComments);

// Convert XML tags
sanitized = convertXmlTags(sanitized);
// Convert XML tags – skip code blocks and inline code
sanitized = applyToNonCodeRegions(sanitized, convertXmlTags);

// URI filtering (shared with core)
sanitized = sanitizeUrlProtocols(sanitized);
Expand Down
75 changes: 75 additions & 0 deletions actions/setup/js/sanitize_content.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,81 @@ describe("sanitize_content.cjs", () => {
});
});

describe("XML/HTML tag conversion: code-region awareness", () => {
it("should preserve angle brackets inside fenced code blocks (backticks)", () => {
const input = "Before\n```\nVBuffer<float32> x;\n```\nAfter";
const result = sanitizeContent(input);
expect(result).toContain("VBuffer<float32>");
expect(result).not.toContain("VBuffer(float32)");
});

it("should preserve angle brackets inside fenced code blocks (tildes)", () => {
const input = "Before\n~~~\nfoo<int> bar;\n~~~\nAfter";
const result = sanitizeContent(input);
expect(result).toContain("foo<int>");
expect(result).not.toContain("foo(int)");
});

it("should preserve angle brackets inside inline code spans", () => {
const result = sanitizeContent("Use `VBuffer<float32>` for vectors");
expect(result).toContain("`VBuffer<float32>`");
expect(result).not.toContain("VBuffer(float32)");
});

it("should still convert angle brackets in regular text", () => {
const result = sanitizeContent("Watch out for <script>alert(1)</script> here");
expect(result).toContain("(script)");
expect(result).not.toContain("<script>");
});

it("should handle mixed content: code block with tags and regular text with tags", () => {
const input = "Normal: <div>bad</div>\n```\n<div>safe code</div>\n```\nNormal again: <img src=x>";
const result = sanitizeContent(input);
// Regular text: tags converted
expect(result).toContain("(div)bad(/div)");
// Code block: tags preserved
expect(result).toContain("<div>safe code</div>");
// Regular text after block: tags converted
expect(result).toContain("(img src=x)");
});

it("should handle a fenced block with a language specifier", () => {
const input = "```typescript\nconst arr: Array<string> = [];\n```";
const result = sanitizeContent(input);
expect(result).toContain("Array<string>");
expect(result).not.toContain("Array(string)");
});

it("should preserve XML comments inside fenced code blocks", () => {
const input = "```xml\n<!-- comment -->\n<tag>value</tag>\n```";
const result = sanitizeContent(input);
expect(result).toContain("<!-- comment -->");
expect(result).toContain("<tag>value</tag>");
});

it("should still remove XML comments outside code blocks", () => {
const result = sanitizeContent("text <!-- remove me --> end");
expect(result).not.toContain("<!-- remove me -->");
expect(result).toContain("text");
expect(result).toContain("end");
});

it("should preserve inline code with multiple backticks", () => {
const result = sanitizeContent("Use ``VBuffer<float32>`` inline");
expect(result).toContain("``VBuffer<float32>``");
expect(result).not.toContain("VBuffer(float32)");
});

it("should handle issue title example: VBuffer<float32>", () => {
// Simulates a title where type parameters are in inline code
const result = sanitizeContent("Support for `VBuffer<float32>` and `VBuffer<float>`");
expect(result).toContain("`VBuffer<float32>`");
expect(result).toContain("`VBuffer<float>`");
expect(result).not.toContain("VBuffer(float32)");
expect(result).not.toContain("VBuffer(float)");
});
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new tests cover top-level fenced blocks and inline code spans, but they don’t cover fenced code blocks inside blockquotes (e.g. > ```yaml) or list items—both are common markdown patterns and currently not detected by getFencedCodeRanges. Adding coverage for these cases would prevent regressions where angle brackets are still converted inside those code blocks.

Suggested change
});
});
it("should treat fenced code blocks inside blockquotes as code regions", () => {
const markdown = [
"> ```yaml",
"> apiVersion: v1",
"> kind: Pod<V1>",
"> ```",
].join("\n");
const result = sanitizeContent(markdown);
expect(result).toContain("kind: Pod<V1>");
expect(result).not.toContain("kind: Pod(V1)");
});
it("should treat fenced code blocks inside list items as code regions", () => {
const markdown = [
"- ```csharp",
" var list = new List<string>();",
" ```",
].join("\n");
const result = sanitizeContent(markdown);
expect(result).toContain("List<string>");
expect(result).not.toContain("List(string)");
});

Copilot uses AI. Check for mistakes.
});

describe("ANSI escape sequence removal", () => {
it("should remove ANSI color codes", () => {
const result = sanitizeContent("\x1b[31mred text\x1b[0m");
Expand Down
190 changes: 186 additions & 4 deletions actions/setup/js/sanitize_content_core.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,186 @@ function neutralizeAllMentions(s) {
});
}

/**
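* Returns the character ranges [start, end) of fenced code blocks in markdown content.
* A block opens on a line whose trimmed text starts with 3+ backticks or 3+ tildes, and
* closes on a later line whose trimmed text consists solely of the same fence character
* repeated at least as many times as in the opening fence.
* Each returned range spans from the first character of the opening fence line through
* the last character of the closing fence line (inclusive of any trailing newline).
* An unclosed fence yields a range that extends to the end of the input.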
*
* @param {string} s - Markdown content to scan
* @returns {Array<[number, number]>} Array of [start, end) character positions
*/
function getFencedCodeRanges(s) {
/** @type {Array<[number, number]>} */
const ranges = [];
const lines = s.split("\n");
let pos = 0;
let inBlock = false;
let blockStart = -1;
let fenceChar = "";
let fenceLen = 0;

for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const trimmed = line.trim();
// Character position of the end of this line's content (not including the newline separator)
const lineContentEnd = pos + line.length;
// Character position after the newline separator (or same as lineContentEnd for the last line)
const lineEnd = i < lines.length - 1 ? lineContentEnd + 1 : lineContentEnd;

if (!inBlock) {
const m = trimmed.match(/^(`{3,}|~{3,})/);
if (m) {
inBlock = true;
blockStart = pos;
fenceChar = m[1][0];
fenceLen = m[1].length;
}
Comment on lines +344 to +351
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getFencedCodeRanges only recognizes fences when the trimmed line starts with backticks/tildes. This misses valid fenced code blocks inside blockquotes (e.g. > ```yaml) and list items (e.g. 1. ``` or - ```), so XML tag conversion/comment removal will still run inside those code blocks and continue mangling code samples in those common markdown constructs. Consider extending the fence detection to handle >-prefixed blockquote lines and list-item prefixes (or switching to a markdown fence parser that understands these cases).

Copilot uses AI. Check for mistakes.
} else {
// A closing fence: same character, at least as long, only whitespace after
const fc = fenceChar === "`" ? "\\`" : "~";
const closingRegex = new RegExp(`^[${fc}]{${fenceLen},}\\s*$`);
if (closingRegex.test(trimmed)) {
ranges.push([blockStart, lineEnd]);
inBlock = false;
blockStart = -1;
fenceChar = "";
fenceLen = 0;
}
}

pos = lineEnd;
}

// Unclosed fence – treat the rest as code (safer fallback)
if (inBlock && blockStart !== -1) {
ranges.push([blockStart, s.length]);
}

return ranges;
}

/**
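* Applies a transformation function to a text segment while skipping inline code spans.
* An inline code span starts at a run of N backticks and ends at the next run of exactly
* N backticks; a backtick run with no matching closer is left in place as ordinary text.
* The transformation is applied to each run of non-code text; inline code spans
* (including their delimiting backticks) are preserved verbatim.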
*
* @param {string} text - The text to process (should not contain fenced code blocks)
* @param {function(string): string} fn - Transformation to apply to non-code portions
* @returns {string} The processed text
*/
function applyFnOutsideInlineCode(text, fn) {
if (!text) return fn(text || "");

const parts = [];
let i = 0;
let textStart = 0;

while (i < text.length) {
if (text[i] !== "`") {
i++;
continue;
}

// Count consecutive backticks at the current position
const btStart = i;
let btCount = 0;
while (i < text.length && text[i] === "`") {
btCount++;
i++;
}
// i is now past the opening backtick sequence

// Look for the matching closing sequence of exactly btCount backticks
let closeIdx = -1;
let j = i;
while (j < text.length) {
if (text[j] === "`") {
let closeCount = 0;
const jStart = j;
while (j < text.length && text[j] === "`") {
closeCount++;
j++;
}
if (closeCount === btCount) {
closeIdx = jStart;
break;
}
// Different length – keep scanning (j already advanced past these backticks)
} else {
j++;
}
}

if (closeIdx !== -1) {
// Valid inline code span found: apply fn to the text before it, then keep the code span
if (textStart < btStart) {
parts.push(fn(text.slice(textStart, btStart)));
}
parts.push(text.slice(btStart, closeIdx + btCount));
textStart = closeIdx + btCount;
i = textStart;
}
// If no matching close was found, the backticks are treated as regular text (i already advanced)
Comment on lines +392 to +437
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

applyFnOutsideInlineCode can become O(n²) for pathological inputs with many unmatched/differently-sized backtick runs because for each opener it linearly scans the remainder of the string to find a matching closer. Given the sanitizer allows up to ~524k chars, this can become a noticeable CPU hotspot / potential DoS vector. Consider rewriting this as a single-pass state machine that advances j monotonically (or using a regex-based tokenizer) so the overall runtime is O(n).

Suggested change
while (i < text.length) {
if (text[i] !== "`") {
i++;
continue;
}
// Count consecutive backticks at the current position
const btStart = i;
let btCount = 0;
while (i < text.length && text[i] === "`") {
btCount++;
i++;
}
// i is now past the opening backtick sequence
// Look for the matching closing sequence of exactly btCount backticks
let closeIdx = -1;
let j = i;
while (j < text.length) {
if (text[j] === "`") {
let closeCount = 0;
const jStart = j;
while (j < text.length && text[j] === "`") {
closeCount++;
j++;
}
if (closeCount === btCount) {
closeIdx = jStart;
break;
}
// Different length – keep scanning (j already advanced past these backticks)
} else {
j++;
}
}
if (closeIdx !== -1) {
// Valid inline code span found: apply fn to the text before it, then keep the code span
if (textStart < btStart) {
parts.push(fn(text.slice(textStart, btStart)));
}
parts.push(text.slice(btStart, closeIdx + btCount));
textStart = closeIdx + btCount;
i = textStart;
}
// If no matching close was found, the backticks are treated as regular text (i already advanced)
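// To avoid O(n^2) behavior when scanning for matching backtick runs,
// we pre-scan the string once to find all backtick runs, then pair them
// by length in a single pass: for a given run length, the first occurrence
// opens a span, the next closes it, and so on, ignoring intervening runs of
// different lengths.
// NOTE: this is not fully equivalent to the nested scan it replaces. A run
// that sits inside an already-matched span still registers as a pending
// opener, so it can consume a later run that the nested scan would have used
// to open a new span (e.g. in "`` ` `` x ` y `" the trailing "` y `" is no
// longer protected).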
/** @type {{ start: number, length: number }[]} */
const runs = [];
for (let idx = 0; idx < text.length;) {
if (text[idx] !== "`") {
idx++;
continue;
}
const start = idx;
let count = 0;
while (idx < text.length && text[idx] === "`") {
count++;
idx++;
}
runs.push({ start, length: count });
}
/**
* Map from backtick run length to the start index of the currently
* pending opener (if any) for that length.
* @type {Record<string, number | undefined>}
*/
const pending = Object.create(null);
/** @type {{ openStart: number, openLen: number, closeStart: number }[]} */
const spans = [];
for (const run of runs) {
const lenKey = String(run.length);
const prev = pending[lenKey];
if (prev === undefined) {
// No pending opener for this length: mark this run as an opener.
pending[lenKey] = run.start;
} else {
// Found a closer for the existing opener of this length.
spans.push({
openStart: prev,
openLen: run.length,
closeStart: run.start,
});
pending[lenKey] = undefined;
}
}
// spans are generated in the order in which their closing runs appear.
// Since each openStart is always before its closeStart and runs are
// iterated in increasing order of start, spans are effectively ordered
// by appearance in the text and do not overlap for a given length.
// We now walk through the spans, emitting processed text and raw code.
for (const span of spans) {
const btStart = span.openStart;
const btCount = span.openLen;
const closeIdx = span.closeStart;
// Skip spans that start before the current textStart (e.g., if any
// earlier spans have already advanced textStart past them).
if (btStart < textStart) {
continue;
}
// Apply fn to non-code text before this code span.
if (textStart < btStart) {
parts.push(fn(text.slice(textStart, btStart)));
}
// Preserve the code span (including backticks) unchanged.
parts.push(text.slice(btStart, closeIdx + btCount));
textStart = closeIdx + btCount;

Copilot uses AI. Check for mistakes.
}

// Apply fn to any remaining non-code text
if (textStart < text.length) {
parts.push(fn(text.slice(textStart)));
}

return parts.join("");
}

/**
* Applies a transformation function only to the non-code regions of markdown content.
* Skips both fenced code blocks (``` / ~~~ delimited) and inline code spans (backtick
* delimited) so that the transformation is not applied to code content.
*
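* Falsy input (empty string, null, undefined) yields "" without calling fn; a truthy
* non-string value is returned unchanged. If any error is thrown while processing
* (including by fn itself), falls back to applying fn to the entire string.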
*
* @param {string} s - Markdown content to process
* @param {function(string): string} fn - Transformation to apply outside code regions
* @returns {string} The content with the transformation applied only outside code regions
*/
function applyToNonCodeRegions(s, fn) {
if (!s || typeof s !== "string") {
return s || "";
}
Comment on lines +459 to +462
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The JSDoc for applyToNonCodeRegions says it returns a string, but when s is truthy and not a string (e.g. Buffer/object) it returns s as-is (return s || ""). Since this helper is exported, that return-type mismatch can leak non-strings to callers. Consider always returning a string here (e.g. "" for non-strings) or updating the JSDoc/type contract accordingly.

Copilot uses AI. Check for mistakes.

try {
const codeRanges = getFencedCodeRanges(s);

if (codeRanges.length === 0) {
// No fenced code blocks – still protect inline code spans
return applyFnOutsideInlineCode(s, fn);
}

const parts = [];
let pos = 0;

for (const [start, end] of codeRanges) {
if (pos < start) {
// Non-code text before this code block: protect inline code spans
parts.push(applyFnOutsideInlineCode(s.slice(pos, start), fn));
}
// Fenced code block: preserve verbatim
parts.push(s.slice(start, end));
pos = end;
}

// Non-code text after the last code block
if (pos < s.length) {
parts.push(applyFnOutsideInlineCode(s.slice(pos), fn));
}

return parts.join("");
} catch (_e) {
// Fallback: apply fn to the entire string (conservative – redacts more, never less)
return fn(s);
}
}

/**
* Removes XML comments from content
* @param {string} s - The string to process
Expand Down Expand Up @@ -783,11 +963,12 @@ function sanitizeContentCore(content, maxLength, maxBotMentions) {
// Neutralize ALL @mentions (no filtering in core version)
sanitized = neutralizeAllMentions(sanitized);

// Remove XML comments first
sanitized = removeXmlComments(sanitized);
// Remove XML comments – skip code blocks and inline code to avoid altering code content
sanitized = applyToNonCodeRegions(sanitized, removeXmlComments);

// Convert XML tags to parentheses format to prevent injection
sanitized = convertXmlTags(sanitized);
// Convert XML tags to parentheses format – skip code blocks and inline code so that
// type parameters (e.g. VBuffer<float32>) and code containing angle brackets are preserved
sanitized = applyToNonCodeRegions(sanitized, convertXmlTags);

Comment on lines +966 to 972
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

applyToNonCodeRegions determines code regions before balanceCodeRegions runs later in the pipeline. If balanceCodeRegions modifies fence lengths / closes unclosed fences (a scenario this repo explicitly expects for AI-generated markdown), content that ends up inside a code block after balancing may still have had XML comment removal / tag conversion applied earlier, reintroducing the original mangling for malformed markdown. Consider balancing code regions before running the code-aware XML transforms, or running the XML transforms on the balanced markdown so both stages agree on code boundaries.

Copilot uses AI. Check for mistakes.
// URI filtering - replace non-https protocols with "(redacted)"
sanitized = sanitizeUrlProtocols(sanitized);
Expand Down Expand Up @@ -834,6 +1015,7 @@ module.exports = {
neutralizeGitHubReferences,
removeXmlComments,
convertXmlTags,
applyToNonCodeRegions,
neutralizeBotTriggers,
MAX_BOT_TRIGGER_REFERENCES,
neutralizeTemplateDelimiters,
Expand Down
Loading