diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index d2b6643c700e88..75b68b6997b3ad 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -328,6 +328,67 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUnstable_UCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to lower case, store the result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be lowercased, and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + See also :c:macro:`PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnstable_UCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to upper case, store the result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be uppercased, and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + See also :c:macro:`PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnstable_UCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to title case, store the result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be titlecased, and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + See also :c:macro:`PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. 
versionadded:: next + + +.. c:function:: Py_ssize_t PyUnstable_UCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Foldcase *ch*, store the result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be foldcased, and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + See also :c:macro:`PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + +.. c:macro:: PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE + + The smallest buffer size that is sufficient for any call to + :c:func:`PyUnstable_UCS4_ToLower`, :c:func:`PyUnstable_UCS4_ToUpper`, + :c:func:`PyUnstable_UCS4_ToTitle`, or :c:func:`PyUnstable_UCS4_ToFolded`. + That is, ``3`` for Unicode 16.0. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 631a6570658410..75a3702bcf35af 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -732,6 +732,31 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + + // Helper array used by 
Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; @@ -766,6 +791,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..3a5d1a0053f351 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,65 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from 
_testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") + + # Test uncased character (returned unchanged) + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Misc/NEWS.d/next/C_API/2026-02-12-17-36-07.gh-issue-76535.N2pwE4.rst b/Misc/NEWS.d/next/C_API/2026-02-12-17-36-07.gh-issue-76535.N2pwE4.rst new file mode 100644 index 00000000000000..33b312d1eb25ea --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2026-02-12-17-36-07.gh-issue-76535.N2pwE4.rst @@ -0,0 +1,2 @@ +Add unstable C APIs for lower-casing, upper-casing, title-casing and +casefolding single Unicode code points. 
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..d48199f30e1a65 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,75 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +static PyObject * +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), + Py_UCS4 *buf, Py_ssize_t size) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); + + Py_ssize_t chars = function(c, buf, size); + if (chars < 0) { + return NULL; + } + + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); +} + +/* Test PyUnstable_UCS4_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUnstable_UCS4_ToLower, + buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + +/* Test PyUnstable_UCS4_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUnstable_UCS4_ToUpper, + buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUnstable_UCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf; + return unicode_case_operation(arg, PyUnstable_UCS4_ToUpper, &buf, 1); +} + +/* Test PyUnstable_UCS4_ToTitle() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUnstable_UCS4_ToTitle, + buf, 
PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUnstable_UCS4_ToFolded() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUnstable_UCS4_ToFolded, buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +646,11 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index fdd380190ac1ec..7f39696a1a787c 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -199,6 +199,32 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } +Py_ssize_t PyUnstable_UCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->lower & 0xFFFF; + int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + res[0] = ch + ctype->lower; + return 1; +} + int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -215,6 +241,32 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) return 1; } 
+Py_ssize_t PyUnstable_UCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->title & 0xFFFF; + int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + res[0] = ch + ctype->title; + return 1; +} + int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -231,6 +283,32 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) return 1; } +Py_ssize_t PyUnstable_UCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->upper & 0xFFFF; + int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + res[0] = ch + ctype->upper; + return 1; +} + int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -247,6 +325,27 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) return 1; } +Py_ssize_t PyUnstable_UCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { + int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); + int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too 
small"); + return -1; + } + + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + + return PyUnstable_UCS4_ToLower(ch, res, size); +} + int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);