From c538bdf0d74ada228a87a2a3ddfc29b5d93edf45 Mon Sep 17 00:00:00 2001
From: hobostay <hobostay@users.noreply.github.com>
Date: Mon, 23 Mar 2026 15:00:45 +0800
Subject: [PATCH] fix: validate Unicode codepoints in utf8_encode()

The Unicode standard defines the maximum valid codepoint as U+10FFFF.
Codepoints above this value are invalid and produce malformed UTF-8
sequences. This patch adds validation to replace out-of-range codepoints
with the Unicode replacement character U+FFFD.

This follows RFC 3629, which restricts UTF-8 to the range U+0000..U+10FFFF
so that it matches the range of codepoints reachable through UTF-16 and
stays consistent with the Unicode standard.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 main.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/main.c b/main.c
index 2a5cf81..3e0a671 100644
--- a/main.c
+++ b/main.c
@@ -246,8 +246,19 @@ static uint32_t raylib_key_unshifted_codepoint(int rl_key)
 
 // Encode a single Unicode codepoint into a UTF-8 byte buffer.
 // Returns the number of bytes written (1–4).
+// Invalid codepoints (> U+10FFFF) are replaced with U+FFFD.
 static int utf8_encode(uint32_t cp, char out[4])
 {
+    // Unicode defines the maximum valid codepoint as U+10FFFF.
+    // Codepoints above this value are invalid and should be replaced
+    // with the Unicode replacement character U+FFFD.
+    const uint32_t MAX_UNICODE = 0x10FFFF;
+    const uint32_t REPLACEMENT_CHAR = 0xFFFD;
+
+    if (cp > MAX_UNICODE) {
+        cp = REPLACEMENT_CHAR;
+    }
+
     if (cp < 0x80) {
         out[0] = (char)cp;
         return 1;