From 1adc6aa366e38d8626b49118323bfec2b3805af6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 12:05:51 +0000 Subject: [PATCH 1/3] Initial plan From 9335458c7f3ea2cefab8b4309f2456b7b73c9127 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 12:29:00 +0000 Subject: [PATCH 2/3] fix: strip Unicode Tag Characters (U+E0000-U+E007F) in hardenUnicodeText MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These Cf-category codepoints (Plane 14) have exact 1:1 ASCII equivalents and are completely invisible in all standard renderers including GitHub Markdown. A prompt-injected agent could encode any payload in Tag Characters and post it via the safe-outputs write path — it would appear blank to human reviewers but decode 1:1 for downstream consumers. Add Step 3b to hardenUnicodeText() that strips the full Tag Characters block (U+E0000–U+E007F, surrogate pairs \uDB40\uDC00–\uDB40\uDC7F) and add 7 regression tests covering boundary codepoints, full payload encoding, mixed text, and the @mention-bypass vector using Tag Characters. 
Agent-Logs-Url: https://github.com/github/gh-aw/sessions/a54c95e5-86a7-4979-8835-18ab24cfd2c2 Co-authored-by: szabta89 <1330202+szabta89@users.noreply.github.com> --- actions/setup/js/sanitize_content.test.cjs | 50 ++++++++++++++++++++++ actions/setup/js/sanitize_content_core.cjs | 8 ++++ 2 files changed, 58 insertions(+) diff --git a/actions/setup/js/sanitize_content.test.cjs b/actions/setup/js/sanitize_content.test.cjs index 15ae3b886ec..cf67bf89ede 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1745,6 +1745,56 @@ describe("sanitize_content.cjs", () => { }); }); + describe("Unicode Tag Characters removal (U+E0020–U+E007F, Plane 14)", () => { + it("should strip a single Tag Characters codepoint (U+E0041 = TAG LATIN CAPITAL LETTER A)", () => { + // \uDB40\uDC41 is the surrogate pair for U+E0041 + const input = "Hello\uDB40\uDC41World"; + expect(sanitizeContent(input)).toBe("HelloWorld"); + }); + + it("should strip LANGUAGE TAG (U+E0001) at the boundary of the Tag block", () => { + // \uDB40\uDC01 is the surrogate pair for U+E0001 + const input = "test\uDB40\uDC01"; + expect(sanitizeContent(input)).toBe("test"); + }); + + it("should strip CANCEL TAG (U+E007F) at the upper boundary of the Tag block", () => { + // \uDB40\uDC7F is the surrogate pair for U+E007F + const input = "\uDB40\uDC7Ftest"; + expect(sanitizeContent(input)).toBe("test"); + }); + + it("should strip a full ASCII string encoded in Tag Characters — invisible payload attack", () => { + // Encode "SECRET" using Tag Characters: each ASCII char C -> U+E0000+C + // S=0x53, E=0x45, C=0x43, R=0x52, E=0x45, T=0x54 + const tagS = "\uDB40\uDC53"; + const tagE = "\uDB40\uDC45"; + const tagC = "\uDB40\uDC43"; + const tagR = "\uDB40\uDC52"; + const tagT = "\uDB40\uDC54"; + const encoded = tagS + tagE + tagC + tagR + tagE + tagT; + expect(sanitizeContent(encoded)).toBe(""); + }); + + it("should strip Tag Characters mixed with normal ASCII text", () => { 
+ // Tag-encoded 'A' and 'B' (U+E0041, U+E0042) interspersed with normal letters + const input = "a\uDB40\uDC41b\uDB40\uDC42c"; + expect(sanitizeContent(input)).toBe("abc"); + }); + + it("should strip multiple adjacent Tag Characters", () => { + // TAG LATIN CAPITAL LETTER A through D (U+E0041–U+E0044) + const input = "\uDB40\uDC41\uDB40\uDC42\uDB40\uDC43\uDB40\uDC44"; + expect(sanitizeContent(input)).toBe(""); + }); + + it("should neutralize @mention bypass using Tag Characters between @ and username", () => { + // Inserting a Tag Character between @ and username to bypass mention detection + const input = "@\uDB40\uDC41admin please review"; + expect(sanitizeContent(input)).toBe("`@admin` please review"); + }); + }); + describe("@mention bypass prevention via invisible characters", () => { it("should neutralize @mention with U+200F (RTL mark) inserted between @ and username", () => { const input = "@\u200Fadmin please review"; diff --git a/actions/setup/js/sanitize_content_core.cjs b/actions/setup/js/sanitize_content_core.cjs index d7730cf46e3..5868a7fd2c6 100644 --- a/actions/setup/js/sanitize_content_core.cjs +++ b/actions/setup/js/sanitize_content_core.cjs @@ -1088,6 +1088,14 @@ function hardenUnicodeText(text) { // word joiner, and byte order mark result = result.replace(/[\u00AD\u034F\u200B\u200C\u200D\u200E\u200F\u2060\uFEFF]/g, ""); + // Step 3b: Strip Unicode Tag Characters block (U+E0000–U+E007F, Plane 14). + // Its tag characters U+E0020–U+E007E (category Cf) mirror printable ASCII 1:1 + // (e.g. U+E0041 = TAG LATIN CAPITAL LETTER A) and are completely invisible + // in all standard renderers including GitHub Markdown, enabling fully + // invisible prompt-injection payloads that decode 1:1 to ASCII content. + // Represented as surrogate pairs \uDB40\uDC00–\uDB40\uDC7F in JavaScript. 
+ result = result.replace(/\uDB40[\uDC00-\uDC7F]/g, ""); + // Step 4: Remove bidirectional text override controls // These can be used to reverse text direction and create visual spoofs result = result.replace(/[\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]/g, ""); From 0f47b562cb64aa59326006a8983b273e1eb36c46 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:22:14 +0000 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20correct=20describe=20block=20title?= =?UTF-8?q?=20to=20U+E0000=E2=80=93U+E007F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent-Logs-Url: https://github.com/github/gh-aw/sessions/8cb5f32b-51ab-4855-984b-0c72606e70f4 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- actions/setup/js/sanitize_content.test.cjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/actions/setup/js/sanitize_content.test.cjs b/actions/setup/js/sanitize_content.test.cjs index cf67bf89ede..f35ed12c8cd 100644 --- a/actions/setup/js/sanitize_content.test.cjs +++ b/actions/setup/js/sanitize_content.test.cjs @@ -1745,7 +1745,7 @@ describe("sanitize_content.cjs", () => { }); }); - describe("Unicode Tag Characters removal (U+E0020–U+E007F, Plane 14)", () => { + describe("Unicode Tag Characters removal (U+E0000–U+E007F, Plane 14)", () => { it("should strip a single Tag Characters codepoint (U+E0041 = TAG LATIN CAPITAL LETTER A)", () => { // \uDB40\uDC41 is the surrogate pair for U+E0041 const input = "Hello\uDB40\uDC41World";