diff --git a/actions/setup/js/sanitize_content.test.cjs b/actions/setup/js/sanitize_content.test.cjs index 15ae3b886ec..f35ed12c8cd 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1745,6 +1745,56 @@ describe("sanitize_content.cjs", () => { }); }); + describe("Unicode Tag Characters removal (U+E0000–U+E007F, Plane 14)", () => { + it("should strip a single Tag Characters codepoint (U+E0041 = TAG LATIN CAPITAL LETTER A)", () => { + // \uDB40\uDC41 is the surrogate pair for U+E0041 + const input = "Hello\uDB40\uDC41World"; + expect(sanitizeContent(input)).toBe("HelloWorld"); + }); + + it("should strip LANGUAGE TAG (U+E0001) at the boundary of the Tag block", () => { + // \uDB40\uDC01 is the surrogate pair for U+E0001 (the stripped range itself begins one code point lower, at U+E0000 = \uDB40\uDC00) + const input = "test\uDB40\uDC01"; + expect(sanitizeContent(input)).toBe("test"); + }); + + it("should strip CANCEL TAG (U+E007F) at the upper boundary of the Tag block", () => { + // \uDB40\uDC7F is the surrogate pair for U+E007F + const input = "\uDB40\uDC7Ftest"; + expect(sanitizeContent(input)).toBe("test"); + }); + + it("should strip a full ASCII string encoded in Tag Characters — invisible payload attack", () => { + // Encode "SECRET" using Tag Characters: each ASCII char C -> U+E0000+C + // S=0x53, E=0x45, C=0x43, R=0x52, E=0x45, T=0x54 + const tagS = "\uDB40\uDC53"; + const tagE = "\uDB40\uDC45"; + const tagC = "\uDB40\uDC43"; + const tagR = "\uDB40\uDC52"; + const tagT = "\uDB40\uDC54"; + const encoded = tagS + tagE + tagC + tagR + tagE + tagT; + expect(sanitizeContent(encoded)).toBe(""); + }); + + it("should strip Tag Characters mixed with normal ASCII text", () => { + // Tag-encoded 'A' (U+E0041) and 'B' (U+E0042) interspersed with normal letters + const input = "a\uDB40\uDC41b\uDB40\uDC42c"; + expect(sanitizeContent(input)).toBe("abc"); + }); + + it("should strip multiple adjacent Tag Characters", () => { + // TAG LATIN CAPITAL LETTER A through D (U+E0041–U+E0044) + const input = 
"\uDB40\uDC41\uDB40\uDC42\uDB40\uDC43\uDB40\uDC44"; + expect(sanitizeContent(input)).toBe(""); + }); + + it("should neutralize @mention bypass using Tag Characters between @ and username", () => { + // Inserting a Tag Character between @ and username to bypass mention detection; once it is stripped the mention is expected to come back wrapped in backticks + const input = "@\uDB40\uDC41admin please review"; + expect(sanitizeContent(input)).toBe("`@admin` please review"); + }); + }); + describe("@mention bypass prevention via invisible characters", () => { it("should neutralize @mention with U+200F (RTL mark) inserted between @ and username", () => { const input = "@\u200Fadmin please review"; diff --git a/actions/setup/js/sanitize_content_core.cjs b/actions/setup/js/sanitize_content_core.cjs index d7730cf46e3..5868a7fd2c6 100644 --- a/actions/setup/js/sanitize_content_core.cjs +++ b/actions/setup/js/sanitize_content_core.cjs @@ -1088,6 +1088,14 @@ function hardenUnicodeText(text) { // word joiner, and byte order mark result = result.replace(/[\u00AD\u034F\u200B\u200C\u200D\u200E\u200F\u2060\uFEFF]/g, ""); + // Step 3b: Strip Unicode Tag Characters block (U+E0000–U+E007F, Plane 14). + // Tag characters mirror ASCII one-to-one (U+E0000 + ASCII code, + // e.g. U+E0041 = TAG LATIN CAPITAL LETTER A) and are normally rendered as + // nothing, GitHub Markdown included, which enables fully invisible + // prompt-injection payloads that decode 1:1 to ASCII content. + // Matched as UTF-16 surrogate pairs \uDB40\uDC00–\uDB40\uDC7F (the regex has no "u" flag), so every code point in the block is removed. NOTE(review): emoji tag sequences (e.g. subdivision flags) are also built from these code points — confirm that stripping them is acceptable. + result = result.replace(/\uDB40[\uDC00-\uDC7F]/g, ""); + // Step 4: Remove bidirectional text override controls // These can be used to reverse text direction and create visual spoofs result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, "");