Skip to content
77 changes: 77 additions & 0 deletions pandas/_libs/parsers.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from typing import (
Hashable,
Literal,
)

import numpy as np

from pandas._typing import (
ArrayLike,
Dtype,
)

# Module-level set of strings declared by this stub.
# NOTE(review): presumably the default string tokens treated as NA by the
# parser -- confirm against the definition in parsers.pyx.
STR_NA_VALUES: set[str]


# Stub for the Cython function of the same name in parsers.pyx.
# Per the implementation's docstring: converts the specified values -- the
# members of ``na_values`` and, when ``convert_empty`` is True, empty
# strings -- to np.nan, and returns ``na_count`` as an int.
# The implementation's default for ``convert_empty`` is True.
def sanitize_objects(
    values: np.ndarray,  # ndarray[object]
    na_values: set,
    convert_empty: bool = ...,
) -> int: ...


class TextReader:
unnamed_cols: set[str]
table_width: int # int64_t
leading_cols: int # int64_t
header: list[list[int]] # non-negative integers

def __init__(
self,
source,
delimiter: bytes | str = ..., # single-character only
header=...,
header_start=...,
header_end=...,
index_col=...,
names=...,
tokenize_chunksize: int = ..., # int64_t
delim_whitespace: bool = ...,
converters=...,
skipinitialspace: bool = ...,
escapechar: bytes | str | None = ..., # single-character only
doublequote: bool = ...,
quotechar: str | bytes | None = ..., # at most 1 character
quoting: int = ...,
lineterminator: bytes | str | None = ..., # at most 1 character
comment=...,
decimal: bytes | str = ..., # single-character only
thousands: bytes | str | None = ..., # single-character only
dtype: Dtype | dict[Hashable, Dtype] = ...,
usecols=...,
error_bad_lines: bool = ...,
warn_bad_lines: bool = ...,
na_filter: bool = ...,
na_values=...,
na_fvalues=...,
keep_default_na: bool = ...,
true_values=...,
false_values=...,
allow_leading_cols: bool = ...,
low_memory: bool = ...,
skiprows=...,
skipfooter: int = ..., # int64_t
verbose: bool = ...,
mangle_dupe_cols: bool = ...,
float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
skip_blank_lines: bool = ...,
encoding_errors: bytes | str = ...
): ...

def set_error_bad_lines(self, status: int) -> None: ...
def set_noconvert(self, i: int) -> None: ...
def remove_noconvert(self, i: int) -> None: ...

def close(self) -> None: ...

def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
89 changes: 57 additions & 32 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -319,19 +319,21 @@ cdef class TextReader:
int64_t leading_cols, table_width, skipfooter, buffer_lines
bint allow_leading_cols, mangle_dupe_cols, low_memory
bint delim_whitespace
object delimiter, converters
object delimiter # bytes or str
object converters
object na_values
object header, orig_header, names, header_start, header_end
object orig_header, names, header_start, header_end
list header # list[list[non-negative integers]]
object index_col
object skiprows
object dtype
object usecols
list dtype_cast_order # list[np.dtype]
set unnamed_cols
set noconvert
set unnamed_cols # set[str]
set noconvert # set[int]

def __cinit__(self, source,
delimiter=b',',
delimiter=b',', # bytes | str
header=0,
header_start=0,
header_end=0,
Expand All @@ -341,14 +343,14 @@ cdef class TextReader:
bint delim_whitespace=False,
converters=None,
bint skipinitialspace=False,
escapechar=None,
escapechar=None, # bytes | str
bint doublequote=True,
quotechar=b'"',
quoting=0,
lineterminator=None,
quoting=0, # int
lineterminator=None, # bytes | str
comment=None,
decimal=b'.',
thousands=None,
decimal=b'.', # bytes | str
thousands=None, # bytes | str
dtype=None,
usecols=None,
bint error_bad_lines=True,
Expand All @@ -362,7 +364,7 @@ cdef class TextReader:
bint allow_leading_cols=True,
bint low_memory=False,
skiprows=None,
skipfooter=0,
skipfooter=0, # int64_t
bint verbose=False,
bint mangle_dupe_cols=True,
float_precision=None,
Expand Down Expand Up @@ -518,7 +520,7 @@ cdef class TextReader:
self.parser.header_end = -1
self.parser.header = -1
self.parser_start = 0
self.header = []
prelim_header = []
else:
if isinstance(header, list):
if len(header) > 1:
Expand All @@ -534,16 +536,19 @@ cdef class TextReader:
self.parser_start = header[-1] + 1
self.parser.header_start = header[0]
self.parser.header = header[0]
self.header = header
prelim_header = header
else:
self.parser.header_start = header
self.parser.header_end = header
self.parser_start = header + 1
self.parser.header = header
self.header = [ header ]
prelim_header = [ header ]

self.names = names
self.header, self.table_width, self.unnamed_cols = self._get_header()
header, table_width, unnamed_cols = self._get_header(prelim_header)
self.header = header
self.table_width = table_width
self.unnamed_cols = unnamed_cols

if not self.table_width:
raise EmptyDataError("No columns to parse from file")
Expand All @@ -561,7 +566,7 @@ cdef class TextReader:
self.close()
parser_del(self.parser)

def close(self):
def close(self) -> None:
# also preemptively free all allocated memory
parser_free(self.parser)
if self.true_set:
Expand All @@ -571,10 +576,10 @@ cdef class TextReader:
kh_destroy_str_starts(self.false_set)
self.false_set = NULL

def set_error_bad_lines(self, int status):
def set_error_bad_lines(self, int status) -> None:
self.parser.error_bad_lines = status

def _set_quoting(self, quote_char, quoting):
def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
if not isinstance(quoting, int):
raise TypeError('"quoting" must be an integer')

Expand Down Expand Up @@ -618,21 +623,21 @@ cdef class TextReader:
self.parser.cb_io = &buffer_rd_bytes
self.parser.cb_cleanup = &del_rd_source

cdef _get_header(self):
cdef _get_header(self, list prelim_header):
# header is now a list of lists, so field_count should use header[0]

cdef:
Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
char *word
object name, old_name
str name, old_name
uint64_t hr, data_line = 0
list header = []
set unnamed_cols = set()

if self.parser.header_start >= 0:

# Header is in the file
for level, hr in enumerate(self.header):
for level, hr in enumerate(prelim_header):

this_header = []

Expand Down Expand Up @@ -697,7 +702,7 @@ cdef class TextReader:
# If we have grabbed an extra line, but it's not in our
# format, save in the buffer, and create a blank extra
# line for the rest of the parsing code.
if hr == self.header[-1]:
if hr == prelim_header[-1]:
lc = len(this_header)
ic = (len(self.index_col) if self.index_col
is not None else 0)
Expand Down Expand Up @@ -764,7 +769,7 @@ cdef class TextReader:

return header, field_count, unnamed_cols

def read(self, rows=None):
def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
"""
rows=None --> read all rows
"""
Expand All @@ -777,6 +782,7 @@ cdef class TextReader:

return columns

# -> dict[int, "ArrayLike"]
cdef _read_low_memory(self, rows):
cdef:
size_t rows_read = 0
Expand Down Expand Up @@ -830,6 +836,7 @@ cdef class TextReader:
if status < 0:
raise_parser_error('Error tokenizing data', self.parser)

# -> dict[int, "ArrayLike"]
cdef _read_rows(self, rows, bint trim):
cdef:
int64_t buffered_lines
Expand Down Expand Up @@ -889,13 +896,16 @@ cdef class TextReader:
elapsed = time.time() - self.clocks.pop(-1)
print(f'{what} took: {elapsed * 1000:.2f} ms')

def set_noconvert(self, i):
def set_noconvert(self, i: int) -> None:
self.noconvert.add(i)

def remove_noconvert(self, i):
def remove_noconvert(self, i: int) -> None:
self.noconvert.remove(i)

def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
# TODO: upcast_na only ever False, footer never passed
def _convert_column_data(
self, rows: int | None = None, upcast_na: bool = False, footer: int = 0
) -> dict[int, "ArrayLike"]:
cdef:
int64_t i
int nused
Expand All @@ -904,6 +914,7 @@ cdef class TextReader:
object name, na_flist, col_dtype = None
bint na_filter = 0
int64_t num_cols
dict result

start = self.parser_start

Expand Down Expand Up @@ -1020,6 +1031,7 @@ cdef class TextReader:

return results

# -> tuple["ArrayLike", int]:
cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
object name, bint na_filter,
kh_str_starts_t *na_hashset,
Expand Down Expand Up @@ -1181,13 +1193,14 @@ cdef class TextReader:
else:
raise TypeError(f"the dtype {dtype} is not supported for parsing")

# -> tuple[ndarray[object], int]
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
# Thin wrapper: forwards i, start, end, na_filter and na_hashset unchanged
# to the _string_box_utf8 helper, supplying this reader's parser and
# encoding_errors, and returns that helper's result as-is.

return _string_box_utf8(self.parser, i, start, end, na_filter,
na_hashset, self.encoding_errors)

def _get_converter(self, i, name):
def _get_converter(self, i: int, name):
if self.converters is None:
return None

Expand All @@ -1197,7 +1210,7 @@ cdef class TextReader:
# Converter for position, if any
return self.converters.get(i)

cdef _get_na_list(self, i, name):
cdef _get_na_list(self, Py_ssize_t i, name):
if self.na_values is None:
return None, set()

Expand Down Expand Up @@ -1319,6 +1332,7 @@ def _maybe_upcast(arr):
# Type conversions / inference support code


# -> tuple[ndarray[object], int]
cdef _string_box_utf8(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset,
Expand Down Expand Up @@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
return np.asarray(codes), result, na_count


# -> ndarray[f'|S{width}']
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
int64_t line_end, int64_t width):
cdef:
Expand Down Expand Up @@ -1473,6 +1488,7 @@ cdef:
char* cneginfty = b'-Infinity'


# -> tuple[ndarray[float64_t], int] | tuple[None, None]
cdef _try_double(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
Expand All @@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col,
float64_t *data
float64_t NA = na_values[np.float64]
kh_float64_t *na_fset
ndarray result
ndarray[float64_t] result
bint use_na_flist = len(na_flist) > 0

lines = line_end - line_start
Expand Down Expand Up @@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
return 0


# -> tuple[ndarray[bool], int]
cdef _try_bool_flex(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, const kh_str_starts_t *na_hashset,
Expand Down Expand Up @@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser):
raise ParserError(message)


def _concatenate_chunks(list chunks):
# chunks: list[dict[int, "ArrayLike"]]
# -> dict[int, "ArrayLike"]
def _concatenate_chunks(list chunks) -> dict:
cdef:
list names = list(chunks[0].keys())
object name
Expand Down Expand Up @@ -1964,6 +1983,7 @@ for k in list(na_values):
na_values[np.dtype(k)] = na_values[k]


# -> ArrayLike
cdef _apply_converter(object f, parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end):
cdef:
Expand All @@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
return lib.maybe_convert_objects(result)


def _maybe_encode(values):
cdef list _maybe_encode(list values):
if values is None:
return []
return [x.encode('utf-8') if isinstance(x, str) else x for x in values]


# TODO: only ever called with convert_empty=False
def sanitize_objects(ndarray[object] values, set na_values,
bint convert_empty=True):
bint convert_empty=True) -> int:
"""
Convert specified values, including the given set na_values and empty
strings if convert_empty is True, to np.nan.
Expand All @@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values,
values : ndarray[object]
na_values : set
convert_empty : bool, default True

Returns
-------
na_count : int
"""
cdef:
Py_ssize_t i, n
Expand Down
Loading