Skip to content
Merged
42 changes: 42 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,48 @@ Optional libraries below the lowest tested version may still work, but are not c

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.


.. _whatsnew_150.read_xml_dtypes:

read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
applying converter methods, and parsing dates (:issue:`43567`).

.. ipython:: python

xml_dates = """<?xml version='1.0' encoding='utf-8'?>
<data>
<row>
<shape>square</shape>
<degrees>00360</degrees>
<sides>4.0</sides>
<date>2020-01-01</date>
</row>
<row>
<shape>circle</shape>
<degrees>00360</degrees>
<sides/>
<date>2021-01-01</date>
</row>
<row>
<shape>triangle</shape>
<degrees>00180</degrees>
<sides>3.0</sides>
<date>2022-01-01</date>
</row>
</data>"""

df = pd.read_xml(
xml_dates,
dtype={'sides': 'Int64'},
converters={'degrees': str},
parse_dates=['date']
)
df
df.dtypes

.. _whatsnew_150.api_breaking.other:

Other API changes
Expand Down
13 changes: 11 additions & 2 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@
DtypeArg = Union[Dtype, Dict[Hashable, Dtype]]
DtypeObj = Union[np.dtype, "ExtensionDtype"]

# converters: maps a column key (integer position or column label) to the
# callable used to convert that column's values
ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]]

# parse_dates: a bool, a list of column keys, a list of lists of column keys,
# or a dict mapping a key to a list of column keys
ParseDatesArg = Union[
bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]]
]

# For functions like rename that convert one label to another
Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]]

Expand Down Expand Up @@ -246,8 +254,6 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
XMLParsers = Literal["lxml", "etree"]


# types in DataFrameFormatter
FormattersType = Union[
Expand Down Expand Up @@ -295,3 +301,6 @@ def closed(self) -> bool:

# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

# read_xml parsers
XMLParsers = Literal["lxml", "etree"]
78 changes: 77 additions & 1 deletion pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from pandas._typing import (
CompressionOptions,
ConvertersArg,
DtypeArg,
FilePath,
ParseDatesArg,
ReadBuffer,
StorageOptions,
XMLParsers,
Expand Down Expand Up @@ -67,6 +70,23 @@ class _XMLFrameParser:
names : list
Column names for Data Frame of parsed XML data.

dtype : dict
Data type for data or columns. E.g. {{'a': np.float64,
'b': np.int32, 'c': 'Int64'}}

.. versionadded:: 1.5.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels.

.. versionadded:: 1.5.0

parse_dates : bool or list of int or names or list of lists or dict
Converts either the index or selected columns to datetimes.

.. versionadded:: 1.5.0

encoding : str
Encoding of xml object or document.

Expand Down Expand Up @@ -109,6 +129,9 @@ def __init__(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: ConvertersArg | None,
parse_dates: ParseDatesArg | None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
Expand All @@ -120,6 +143,9 @@ def __init__(
self.elems_only = elems_only
self.attrs_only = attrs_only
self.names = names
self.dtype = dtype
self.converters = converters
self.parse_dates = parse_dates
self.encoding = encoding
self.stylesheet = stylesheet
self.is_style = None
Expand Down Expand Up @@ -671,6 +697,9 @@ def _parse(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: ConvertersArg | None,
parse_dates: ParseDatesArg | None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
Expand Down Expand Up @@ -706,6 +735,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -722,6 +754,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -732,7 +767,13 @@ def _parse(

data_dicts = p.parse_data()

return _data_to_frame(data=data_dicts, **kwargs)
return _data_to_frame(
data=data_dicts,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
**kwargs,
)


@deprecate_nonkeyword_arguments(
Expand All @@ -749,6 +790,9 @@ def read_xml(
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
dtype: DtypeArg | None = None,
converters: ConvertersArg | None = None,
parse_dates: ParseDatesArg | None = None,
# encoding cannot be None for lxml and StringIO input
encoding: str | None = "utf-8",
parser: XMLParsers = "lxml",
Expand Down Expand Up @@ -799,6 +843,35 @@ def read_xml(
Column names for DataFrame of parsed XML data. Use this parameter to
rename original element names and distinguish same named elements.

dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Use `str` or `object` to preserve the parsed text as-is and not
interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 1.5.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.

.. versionadded:: 1.5.0

parse_dates : bool or list of int or names or list of lists or dict, default False
Identifiers to parse index or columns to datetime. The behavior is as follows:

* boolean. If True -> try parsing the index.
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
each as a separate date column.
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
result 'foo'

.. versionadded:: 1.5.0

encoding : str, optional, default 'utf-8'
Encoding of XML document.

Expand Down Expand Up @@ -942,6 +1015,9 @@ def read_xml(
elems_only=elems_only,
attrs_only=attrs_only,
names=names,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
encoding=encoding,
parser=parser,
stylesheet=stylesheet,
Expand Down
Loading