From c538bdf0d74ada228a87a2a3ddfc29b5d93edf45 Mon Sep 17 00:00:00 2001 From: hobostay Date: Mon, 23 Mar 2026 15:00:45 +0800 Subject: [PATCH] fix: validate Unicode codepoints in utf8_encode() The Unicode standard defines the maximum valid codepoint as U+10FFFF. Codepoints above this value are invalid and produce malformed UTF-8 sequences. This patch adds validation to replace out-of-range codepoints with the Unicode replacement character U+FFFD. This follows RFC 3629, which restricts UTF-8 to the range U+0000..U+10FFFF so that it matches the range addressable by UTF-16 and stays consistent with the Unicode standard. Co-Authored-By: Claude Opus 4.6 (1M context) --- main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/main.c b/main.c index 2a5cf81..3e0a671 100644 --- a/main.c +++ b/main.c @@ -246,8 +246,19 @@ static uint32_t raylib_key_unshifted_codepoint(int rl_key) // Encode a single Unicode codepoint into a UTF-8 byte buffer. // Returns the number of bytes written (1–4). +// Invalid codepoints (> U+10FFFF) are replaced with U+FFFD. static int utf8_encode(uint32_t cp, char out[4]) { + // Unicode defines the maximum valid codepoint as U+10FFFF. + // Codepoints above this value are invalid and should be replaced + // with the Unicode replacement character U+FFFD. + const uint32_t MAX_UNICODE = 0x10FFFF; + const uint32_t REPLACEMENT_CHAR = 0xFFFD; + + if (cp > MAX_UNICODE) { + cp = REPLACEMENT_CHAR; + } + if (cp < 0x80) { out[0] = (char)cp; return 1;