Skip to content
4 changes: 4 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA


cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
Expand Down
34 changes: 22 additions & 12 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,6 @@ class StringArray(PandasArray):
StringArray is considered experimental. The implementation and
parts of the API may change without warning.

In particular, the NA value used may change to no longer be
``numpy.nan``.

Parameters
----------
values : array-like
Expand All @@ -104,8 +101,11 @@ class StringArray(PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings. This may
change without warning in the future.
where the elements are Python strings or :attr:`pandas.NA`.
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

copy : bool, default False
Whether to copy the array of data.

Expand All @@ -119,6 +119,8 @@ class StringArray(PandasArray):

See Also
--------
array
The recommended function for creating a StringArray.
Series.str
The string methods are available on Series backed by
a StringArray.
Expand Down Expand Up @@ -165,25 +167,33 @@ def __init__(self, values, copy=False):
def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError(
"StringArray requires a sequence of strings or missing values."
)
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings. Got "
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
if dtype:
assert dtype == "string"
result = super()._from_sequence(scalars, dtype=object, copy=copy)

result = np.asarray(scalars, dtype="object")
if copy and result is scalars:
result = result.copy()

# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
result[result.isna()] = StringDtype.na_value
return result
na_values = isna(result)
if na_values.any():
if result is scalars:
# force a copy now, if we haven't already
result = result.copy()
result[na_values] = StringDtype.na_value

return cls(result)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np

import pandas._libs.lib as lib
import pandas._libs.missing as libmissing
import pandas._libs.ops as libops
from pandas._typing import ArrayLike, Dtype
from pandas.util._decorators import Appender
Expand Down Expand Up @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str):
return result


def _na_map(f, arr, na_result=np.nan, dtype=object):
# should really _check_ for NA
def _na_map(f, arr, na_result=None, dtype=object):
if is_extension_array_dtype(arr.dtype):
if na_result is None:
na_result = libmissing.NA
# just StringDtype
arr = extract_array(arr)
return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
if na_result is None:
na_result = np.nan
return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,25 @@ def test_constructor_raises():
with pytest.raises(ValueError, match="sequence of strings"):
pd.arrays.StringArray(np.array([]))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", np.nan], dtype=object))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", None], dtype=object))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object))


@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy):
a = np.array(["a", np.nan], dtype=object)
original = a.copy()
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
tm.assert_extension_array_equal(result, expected)
tm.assert_numpy_array_equal(a, original)


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,11 +1114,16 @@ def test_is_string_array(self):

assert lib.is_string_array(np.array(["foo", "bar"]))
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
)
assert lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
)
# NaN is not valid for string array, just NA
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
)

assert not lib.is_string_array(np.array([1, 2]))

def test_to_object_array_tuples(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method):

if isinstance(expected, Series):
if expected.dtype == "object" and lib.is_string_array(
expected.values, skipna=True
expected.dropna().values,
):
assert result.dtype == "string"
result = result.astype(object)
Expand Down