From bf4bd31736656e472483cb19519bd760fd60815e Mon Sep 17 00:00:00 2001
From: thaafox <191024423+thaafox@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:21:31 -0400
Subject: [PATCH] Add emoji display width support and width-aware string truncation

mk_wcwidth() returns 0 for emoji codepoints (U+1F300-U+1FAFF) because they
fall through as unassigned. Modern terminals render these as 2-cell-wide
characters, causing column misalignment in any application that relies on
mk_wcwidth() for layout.

Add explicit width mappings:
- Emoji pictographs (U+1F300-U+1F9FF, U+1FA00-U+1FAFF): width 2
- Arrows, misc technical, geometric shapes, misc symbols, dingbats: width 1
- Variation selectors (U+FE00-U+FE0F): width 0

Add utf8_truncate_to_width() which truncates a UTF-8 string to fit within
a target number of display columns, ensuring multi-byte sequences are never
split mid-character.
---
 src/utf8.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++
 src/utf8.h   |  2 ++
 2 files changed, 44 insertions(+)

diff --git a/src/utf8.cpp b/src/utf8.cpp
index 347f326..809d471 100644
--- a/src/utf8.cpp
+++ b/src/utf8.cpp
@@ -299,6 +299,31 @@ std::string utf8_substr (
   return result;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Truncate a UTF-8 string to fit within a target display width.
+// Unlike substr which counts characters, this counts display columns.
+std::string utf8_truncate_to_width (
+  const std::string& input,
+  unsigned int target_width)
+{
+  // Returns the longest prefix of input, ending at an offset reported by
+  // utf8_next_char, whose summed mk_wcwidth stays within target_width.
+  std::string::size_type last_safe = 0;  // end of the last character kept
+  std::string::size_type i = 0;
+  unsigned int current_width = 0;
+
+  while (unsigned int c = utf8_next_char (input, i))  // 0 ends the scan
+  {
+    const int w = mk_wcwidth (c);
+    const unsigned int cols = w > 0 ? static_cast<unsigned int> (w) : 0;
+    if (cols > target_width - current_width)  // current_width <= target_width
+      break;
+    current_width += cols;
+    last_safe = i;
+  }
+  return input.substr (0, last_safe);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 int mk_wcwidth(wchar_t ucs)
 {
@@ -316,6 +341,23 @@ int mk_wcwidth(wchar_t ucs)
   if (width == widechar_ambiguous)
     return 1;
 
+  // Emoji pictographs (U+1F300+) — width 2 in modern terminals
+  if ((ucs >= 0x1F300 && ucs <= 0x1F9FF) ||  // Misc Symbols, Pictographs, Emoticons, Supplemental
+      (ucs >= 0x1FA00 && ucs <= 0x1FAFF))    // Symbols and Pictographs Extended-A
+    return 2;
+
+  // Common symbols — width 1 in modern terminals
+  if ((ucs >= 0x2190 && ucs <= 0x21FF) ||  // Arrows
+      (ucs >= 0x2300 && ucs <= 0x23FF) ||  // Misc Technical
+      (ucs >= 0x25A0 && ucs <= 0x25FF) ||  // Geometric Shapes
+      (ucs >= 0x2600 && ucs <= 0x26FF) ||  // Misc Symbols
+      (ucs >= 0x2700 && ucs <= 0x27BF))    // Dingbats
+    return 1;
+
+  // Variation selectors — zero width
+  if (ucs >= 0xFE00 && ucs <= 0xFE0F)
+    return 0;
+
   // All other negative values
   return 0;
 }
diff --git a/src/utf8.h b/src/utf8.h
index ae720de..e9b4a3d 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -39,6 +39,8 @@ unsigned int utf8_width (const std::string& str);
 unsigned int utf8_text_width (const std::string&);
 std::string utf8_substr (const std::string&, unsigned int, unsigned int length = 0);
 
+std::string utf8_truncate_to_width (const std::string&, unsigned int target_width);
+
 int mk_wcwidth (wchar_t);
 
 #endif