From 5de3f93ea05110088d83809575299ffd1a90c79d Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Fri, 28 Mar 2025 11:19:05 +0000 Subject: [PATCH 1/5] replace `PyString::from_object` with `PyString::from_encoded_object` --- newsfragments/5017.added.md | 1 + newsfragments/5017.changed.md | 1 + src/types/string.rs | 55 +++++++++++++++++++++++++++-------- 3 files changed, 45 insertions(+), 12 deletions(-) create mode 100644 newsfragments/5017.added.md create mode 100644 newsfragments/5017.changed.md diff --git a/newsfragments/5017.added.md b/newsfragments/5017.added.md new file mode 100644 index 00000000000..bf27988735e --- /dev/null +++ b/newsfragments/5017.added.md @@ -0,0 +1 @@ +Add `PyString::from_encoded_object`. diff --git a/newsfragments/5017.changed.md b/newsfragments/5017.changed.md new file mode 100644 index 00000000000..9d5f477cff7 --- /dev/null +++ b/newsfragments/5017.changed.md @@ -0,0 +1 @@ +Deprecate `PyString::from_object` in favour of `PyString::from_encoded_object`. diff --git a/src/types/string.rs b/src/types/string.rs index a389f0df234..11f76d5026b 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -10,7 +10,7 @@ use crate::types::PyBytes; use crate::IntoPy; use crate::{ffi, Bound, Py, PyAny, PyResult, Python}; use std::borrow::Cow; -use std::ffi::CString; +use std::ffi::{CStr, CString}; use std::str; /// Deprecated alias for [`PyString`]. @@ -212,6 +212,26 @@ impl PyString { /// Attempts to create a Python string from a Python [bytes-like object]. /// /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object). + pub fn from_encoded_object<'py>( + src: &Bound<'py, PyAny>, + encoding: &CStr, + errors: &CStr, + ) -> PyResult> { + unsafe { + ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding.as_ptr(), errors.as_ptr()) + .assume_owned_or_err(src.py()) + .downcast_into_unchecked() + } + } + + /// Deprecated form of `PyString::from_encoded_object`. + /// + /// This version took `&str` arguments for `encoding` and `errors`, which required a runtime + /// conversion to `CString` internally. + #[deprecated( + since = "0.25.0", + note = "replaced with to `PyString::from_encoded_object`" + )] pub fn from_object<'py>( src: &Bound<'py, PyAny>, encoding: &str, @@ -219,25 +239,21 @@ impl PyString { ) -> PyResult> { let encoding = CString::new(encoding)?; let errors = CString::new(errors)?; - unsafe { - ffi::PyUnicode_FromEncodedObject( - src.as_ptr(), - encoding.as_ptr().cast(), - errors.as_ptr().cast(), - ) - .assume_owned_or_err(src.py()) - .downcast_into_unchecked() - } + PyString::from_encoded_object(src, &encoding, &errors) } /// Deprecated name for [`PyString::from_object`]. - #[deprecated(since = "0.23.0", note = "renamed to `PyString::from_object`")] + #[deprecated( + since = "0.23.0", + note = "replcaed with `PyString::from_encoded_object`" + )] #[inline] pub fn from_object_bound<'py>( src: &Bound<'py, PyAny>, encoding: &str, errors: &str, ) -> PyResult> { + #[allow(deprecated)] Self::from_object(src, encoding, errors) } } @@ -586,6 +602,8 @@ impl PartialEq> for &'_ str { #[cfg(test)] mod tests { + use pyo3_ffi::c_str; + use super::*; use crate::{IntoPyObject, PyObject}; @@ -678,6 +696,14 @@ mod tests { Python::with_gil(|py| { let py_bytes = PyBytes::new(py, b"ab\xFFcd"); + let py_string = + PyString::from_encoded_object(&py_bytes, c_str!("utf-8"), c_str!("ignore")) + .unwrap(); + + let result = py_string.to_cow().unwrap(); + assert_eq!(result, "abcd"); + + #[allow(deprecated)] let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap(); let result = py_string.to_cow().unwrap(); @@ -686,13 +712,18 @@ mod tests { } #[test] - fn test_string_from_obect_with_invalid_encoding_errors() { + fn test_string_from_object_with_invalid_encoding_errors() { Python::with_gil(|py| { let py_bytes = PyBytes::new(py, b"abcd"); + let result = PyString::from_encoded_object(&py_bytes, c_str!("wat"), c_str!("ignore")); + assert!(result.is_err()); + + #[allow(deprecated)] let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore"); assert!(result.is_err()); + #[allow(deprecated)] let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore"); assert!(result.is_err()); }); From 5751b593207b3086a93e3f025bf70c45bf47870e Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Fri, 25 Jul 2025 19:21:38 +0100 Subject: [PATCH 2/5] accept `Option` to allow default values --- src/types/string.rs | 73 +++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/types/string.rs b/src/types/string.rs index 11f76d5026b..8fd7f900f75 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -211,14 +211,28 @@ impl PyString { /// Attempts to create a Python string from a Python [bytes-like object]. /// + /// The `encoding` and `errors` parameters are optional: + /// - If `encoding` is `None`, the default encoding is used (UTF-8). + /// - If `errors` is `None`, the default error handling is used ("strict"). + /// + /// See the [Python documentation on codecs] for more information. + /// /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object). + /// [Python documentation on codecs]: https://docs.python.org/3/library/codecs.html#standard-encodings pub fn from_encoded_object<'py>( src: &Bound<'py, PyAny>, - encoding: &CStr, - errors: &CStr, + encoding: Option<&CStr>, + errors: Option<&CStr>, ) -> PyResult> { + let encoding = encoding.map_or_else(|| std::ptr::null(), CStr::as_ptr); + let errors = errors.map_or_else(|| std::ptr::null(), CStr::as_ptr); + // Safety: + // - `src` is a valid Python object + // - `encoding` and `errors` are either null or valid C strings. `encoding` and `errors` are + // documented as allowing null. + // - `ffi::PyUnicode_FromEncodedObject` returns a new `str` object, or sets an error. unsafe { - ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding.as_ptr(), errors.as_ptr()) + ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding, errors) .assume_owned_or_err(src.py()) .downcast_into_unchecked() } @@ -239,22 +253,7 @@ impl PyString { ) -> PyResult> { let encoding = CString::new(encoding)?; let errors = CString::new(errors)?; - PyString::from_encoded_object(src, &encoding, &errors) - } - - /// Deprecated name for [`PyString::from_object`]. - #[deprecated( - since = "0.23.0", - note = "replcaed with `PyString::from_encoded_object`" - )] - #[inline] - pub fn from_object_bound<'py>( - src: &Bound<'py, PyAny>, - encoding: &str, - errors: &str, - ) -> PyResult> { - #[allow(deprecated)] - Self::from_object(src, encoding, errors) + PyString::from_encoded_object(src, Some(&encoding), Some(&errors)) } } @@ -605,7 +604,7 @@ mod tests { use pyo3_ffi::c_str; use super::*; - use crate::{IntoPyObject, PyObject}; + use crate::{exceptions::PyLookupError, IntoPyObject, PyObject}; #[test] fn test_to_cow_utf8() { @@ -692,13 +691,19 @@ mod tests { } #[test] - fn test_string_from_object() { + fn test_string_from_encoded_object() { Python::with_gil(|py| { let py_bytes = PyBytes::new(py, b"ab\xFFcd"); + // default encoding is utf-8, default error handler is strict + let py_string = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err(); + assert!(py_string + .get_type(py) + .is(&py.get_type::())); + + // with `ignore` error handler, the invalid byte is dropped let py_string = - PyString::from_encoded_object(&py_bytes, c_str!("utf-8"), c_str!("ignore")) - .unwrap(); + PyString::from_encoded_object(&py_bytes, None, Some(c_str!("ignore"))).unwrap(); let result = py_string.to_cow().unwrap(); assert_eq!(result, "abcd"); @@ -712,12 +717,28 @@ mod tests { } #[test] - fn test_string_from_object_with_invalid_encoding_errors() { + fn test_string_from_encoded_object_with_invalid_encoding_errors() { Python::with_gil(|py| { let py_bytes = PyBytes::new(py, b"abcd"); - let result = PyString::from_encoded_object(&py_bytes, c_str!("wat"), c_str!("ignore")); - assert!(result.is_err()); + // invalid encoding + let err = + PyString::from_encoded_object(&py_bytes, Some(c_str!("wat")), None).unwrap_err(); + assert!(err.is_instance(py, &py.get_type::())); + assert_eq!(err.to_string(), "LookupError: unknown encoding: wat"); + + // invalid error handler + let err = PyString::from_encoded_object( + &PyBytes::new(py, b"ab\xFFcd"), + None, + Some(c_str!("wat")), + ) + .unwrap_err(); + assert!(err.is_instance(py, &py.get_type::())); + assert_eq!( + err.to_string(), + "LookupError: unknown error handler name 'wat'" + ); #[allow(deprecated)] let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore"); From f05bf53077650fee30024e1ce8ad00e854c8dc6f Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Sat, 26 Jul 2025 11:58:16 +0100 Subject: [PATCH 3/5] fixup ci --- src/types/string.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/types/string.rs b/src/types/string.rs index 9cd3ba14131..8adfe039864 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -1,4 +1,3 @@ -#[cfg(not(Py_LIMITED_API))] use crate::exceptions::PyUnicodeDecodeError; use crate::ffi_ptr_ext::FfiPtrExt; use crate::instance::Borrowed; @@ -204,8 +203,8 @@ impl PyString { encoding: Option<&CStr>, errors: Option<&CStr>, ) -> PyResult> { - let encoding = encoding.map_or_else(|| std::ptr::null(), CStr::as_ptr); - let errors = errors.map_or_else(|| std::ptr::null(), CStr::as_ptr); + let encoding = encoding.map_or(std::ptr::null(), CStr::as_ptr); + let errors = errors.map_or(std::ptr::null(), CStr::as_ptr); // Safety: // - `src` is a valid Python object // - `encoding` and `errors` are either null or valid C strings. `encoding` and `errors` are From 96b05926672687c3772d88a326bd06efbfe4e59b Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Sat, 26 Jul 2025 18:12:07 +0100 Subject: [PATCH 4/5] fixup --- src/types/string.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/types/string.rs b/src/types/string.rs index 8adfe039864..3ab44800034 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -1,3 +1,4 @@ +#[cfg(not(Py_LIMITED_API))] use crate::exceptions::PyUnicodeDecodeError; use crate::ffi_ptr_ext::FfiPtrExt; use crate::instance::Borrowed; @@ -657,7 +658,7 @@ mod tests { let py_string = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err(); assert!(py_string .get_type(py) - .is(&py.get_type::())); + .is(py.get_type::())); // with `ignore` error handler, the invalid byte is dropped let py_string = From a8f356bfa6123b50814c29356e442910aec649f2 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Fri, 1 Aug 2025 10:05:55 +0100 Subject: [PATCH 5/5] remove redundant import --- src/types/string.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/types/string.rs b/src/types/string.rs index 3ab44800034..c92c58146e6 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -63,7 +63,6 @@ impl<'a> PyStringData<'a> { /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should /// never occur for strings that were created from Python code. pub fn to_string(self, py: Python<'_>) -> PyResult> { - use std::ffi::CStr; match self { Self::Ucs1(data) => match str::from_utf8(data) { Ok(s) => Ok(Cow::Borrowed(s)),