@@ -319,19 +319,21 @@ cdef class TextReader:
319319 int64_t leading_cols, table_width, skipfooter, buffer_lines
320320 bint allow_leading_cols, mangle_dupe_cols, low_memory
321321 bint delim_whitespace
322- object delimiter, converters
322+ object delimiter # bytes or str
323+ object converters
323324 object na_values
324- object header, orig_header, names, header_start, header_end
325+ object orig_header, names, header_start, header_end
326+ list header # list[list[non-negative integers]]
325327 object index_col
326328 object skiprows
327329 object dtype
328330 object usecols
329331 list dtype_cast_order # list[np.dtype]
330- set unnamed_cols
331- set noconvert
332+ set unnamed_cols # set[str]
333+ set noconvert # set[int]
332334
333335 def __cinit__ (self , source ,
334- delimiter = b' ,' ,
336+ delimiter = b' ,' , # bytes | str
335337 header = 0 ,
336338 header_start = 0 ,
337339 header_end = 0 ,
@@ -341,14 +343,14 @@ cdef class TextReader:
341343 bint delim_whitespace = False ,
342344 converters = None ,
343345 bint skipinitialspace = False ,
344- escapechar = None ,
346+ escapechar = None , # bytes | str
345347 bint doublequote = True ,
346348 quotechar = b' "' ,
347- quoting = 0 ,
348- lineterminator = None ,
349+ quoting = 0 , # int
350+ lineterminator = None , # bytes | str
349351 comment = None ,
350- decimal = b' .' ,
351- thousands = None ,
352+ decimal = b' .' , # bytes | str
353+ thousands = None , # bytes | str
352354 dtype = None ,
353355 usecols = None ,
354356 bint error_bad_lines = True ,
@@ -362,7 +364,7 @@ cdef class TextReader:
362364 bint allow_leading_cols = True ,
363365 bint low_memory = False ,
364366 skiprows = None ,
365- skipfooter = 0 ,
367+ skipfooter = 0 , # int64_t
366368 bint verbose = False ,
367369 bint mangle_dupe_cols = True ,
368370 float_precision = None ,
@@ -518,7 +520,7 @@ cdef class TextReader:
518520 self .parser.header_end = - 1
519521 self .parser.header = - 1
520522 self .parser_start = 0
521- self .header = []
523+ prelim_header = []
522524 else :
523525 if isinstance (header, list ):
524526 if len (header) > 1 :
@@ -534,16 +536,19 @@ cdef class TextReader:
534536 self .parser_start = header[- 1 ] + 1
535537 self .parser.header_start = header[0 ]
536538 self .parser.header = header[0 ]
537- self .header = header
539+ prelim_header = header
538540 else :
539541 self .parser.header_start = header
540542 self .parser.header_end = header
541543 self .parser_start = header + 1
542544 self .parser.header = header
543- self .header = [ header ]
545+ prelim_header = [ header ]
544546
545547 self .names = names
546- self .header, self .table_width, self .unnamed_cols = self ._get_header()
548+ header, table_width, unnamed_cols = self ._get_header(prelim_header)
549+ self .header = header
550+ self .table_width = table_width
551+ self .unnamed_cols = unnamed_cols
547552
548553 if not self .table_width:
549554 raise EmptyDataError(" No columns to parse from file" )
@@ -561,7 +566,7 @@ cdef class TextReader:
561566 self .close()
562567 parser_del(self .parser)
563568
564- def close (self ):
569+ def close (self ) -> None :
565570 # also preemptively free all allocated memory
566571 parser_free(self.parser )
567572 if self.true_set:
@@ -571,10 +576,10 @@ cdef class TextReader:
571576 kh_destroy_str_starts(self.false_set )
572577 self.false_set = NULL
573578
574- def set_error_bad_lines (self , int status ):
579+ def set_error_bad_lines(self , int status ) -> None :
575580 self.parser.error_bad_lines = status
576581
577- def _set_quoting (self , quote_char , quoting ):
582+ def _set_quoting(self , quote_char: str | bytes | None , quoting: int ):
578583 if not isinstance (quoting, int ):
579584 raise TypeError (' "quoting" must be an integer' )
580585
@@ -618,21 +623,21 @@ cdef class TextReader:
618623 self .parser.cb_io = & buffer_rd_bytes
619624 self .parser.cb_cleanup = & del_rd_source
620625
621- cdef _get_header(self ):
626+ cdef _get_header(self , list prelim_header ):
622627 # header is now a list of lists, so field_count should use header[0]
623628
624629 cdef:
625630 Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
626631 char * word
627- object name, old_name
632+ str name, old_name
628633 uint64_t hr, data_line = 0
629634 list header = []
630635 set unnamed_cols = set ()
631636
632637 if self .parser.header_start >= 0 :
633638
634639 # Header is in the file
635- for level, hr in enumerate (self .header ):
640+ for level, hr in enumerate (prelim_header ):
636641
637642 this_header = []
638643
@@ -697,7 +702,7 @@ cdef class TextReader:
697702 # If we have grabbed an extra line, but it's not in our
698703 # format, save in the buffer, and create a blank extra
699704 # line for the rest of the parsing code.
700- if hr == self .header [- 1 ]:
705+ if hr == prelim_header [- 1 ]:
701706 lc = len (this_header)
702707 ic = (len (self .index_col) if self .index_col
703708 is not None else 0 )
@@ -764,7 +769,7 @@ cdef class TextReader:
764769
765770 return header, field_count, unnamed_cols
766771
767- def read (self , rows = None ):
772+ def read (self , rows: int | None = None ) -> dict[ int , "ArrayLike"] :
768773 """
769774 rows = None -- > read all rows
770775 """
@@ -777,6 +782,7 @@ cdef class TextReader:
777782
778783 return columns
779784
785+ # -> dict[int , "ArrayLike"]
780786 cdef _read_low_memory(self , rows ):
781787 cdef:
782788 size_t rows_read = 0
@@ -830,6 +836,7 @@ cdef class TextReader:
830836 if status < 0 :
831837 raise_parser_error(' Error tokenizing data' , self .parser)
832838
839+ # -> dict[int, "ArrayLike"]
833840 cdef _read_rows(self , rows, bint trim):
834841 cdef:
835842 int64_t buffered_lines
@@ -889,13 +896,16 @@ cdef class TextReader:
889896 elapsed = time.time() - self .clocks.pop(- 1 )
890897 print (f' {what} took: {elapsed * 1000:.2f} ms' )
891898
892- def set_noconvert (self , i ) :
899+ def set_noconvert (self , i: int ) -> None :
893900 self.noconvert.add(i )
894901
895- def remove_noconvert (self , i ) :
902+ def remove_noconvert(self , i: int ) -> None :
896903 self.noconvert.remove(i )
897904
898- def _convert_column_data (self , rows = None , upcast_na = False , footer = 0 ):
905+ # TODO: upcast_na only ever False , footer never passed
906+ def _convert_column_data(
907+ self , rows: int | None = None , upcast_na: bool = False , footer: int = 0
908+ ) -> dict[int , "ArrayLike"]:
899909 cdef:
900910 int64_t i
901911 int nused
@@ -904,6 +914,7 @@ cdef class TextReader:
904914 object name , na_flist , col_dtype = None
905915 bint na_filter = 0
906916 int64_t num_cols
917+ dict result
907918
908919 start = self .parser_start
909920
@@ -1020,6 +1031,7 @@ cdef class TextReader:
10201031
10211032 return results
10221033
1034+ # -> tuple["ArrayLike", int]:
10231035 cdef inline _convert_tokens(self , Py_ssize_t i, int start, int end,
10241036 object name, bint na_filter,
10251037 kh_str_starts_t * na_hashset,
@@ -1181,13 +1193,14 @@ cdef class TextReader:
11811193 else :
11821194 raise TypeError (f" the dtype {dtype} is not supported for parsing" )
11831195
1196+ # -> tuple[ndarray[object], int]
11841197 cdef _string_convert(self , Py_ssize_t i, int64_t start, int64_t end,
11851198 bint na_filter, kh_str_starts_t * na_hashset):
11861199
11871200 return _string_box_utf8(self .parser, i, start, end, na_filter,
11881201 na_hashset, self .encoding_errors)
11891202
1190- def _get_converter (self , i , name ):
1203+ def _get_converter (self , i: int , name ):
11911204 if self .converters is None :
11921205 return None
11931206
@@ -1197,7 +1210,7 @@ cdef class TextReader:
11971210 # Converter for position, if any
11981211 return self .converters.get(i)
11991212
1200- cdef _get_na_list(self , i, name):
1213+ cdef _get_na_list(self , Py_ssize_t i, name):
12011214 if self .na_values is None :
12021215 return None , set ()
12031216
@@ -1319,6 +1332,7 @@ def _maybe_upcast(arr):
13191332# Type conversions / inference support code
13201333
13211334
1335+ # -> tuple[ndarray[object], int]
13221336cdef _string_box_utf8(parser_t * parser, int64_t col,
13231337 int64_t line_start, int64_t line_end,
13241338 bint na_filter, kh_str_starts_t * na_hashset,
@@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
14321446 return np.asarray(codes), result, na_count
14331447
14341448
1449+ # -> ndarray[f'|S{width}']
14351450cdef _to_fw_string(parser_t * parser, int64_t col, int64_t line_start,
14361451 int64_t line_end, int64_t width):
14371452 cdef:
@@ -1473,6 +1488,7 @@ cdef:
14731488 char * cneginfty = b' -Infinity'
14741489
14751490
1491+ # -> tuple[ndarray[float64_t], int] | tuple[None, None]
14761492cdef _try_double(parser_t * parser, int64_t col,
14771493 int64_t line_start, int64_t line_end,
14781494 bint na_filter, kh_str_starts_t * na_hashset, object na_flist):
@@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col,
14821498 float64_t * data
14831499 float64_t NA = na_values[np.float64]
14841500 kh_float64_t * na_fset
1485- ndarray result
1501+ ndarray[float64_t] result
14861502 bint use_na_flist = len (na_flist) > 0
14871503
14881504 lines = line_end - line_start
@@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
17121728 return 0
17131729
17141730
1731+ # -> tuple[ndarray[bool], int]
17151732cdef _try_bool_flex(parser_t * parser, int64_t col,
17161733 int64_t line_start, int64_t line_end,
17171734 bint na_filter, const kh_str_starts_t * na_hashset,
@@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser):
18901907 raise ParserError(message)
18911908
18921909
1893- def _concatenate_chunks (list chunks ):
1910+ # chunks: list[dict[int, "ArrayLike"]]
1911+ # -> dict[int, "ArrayLike"]
1912+ def _concatenate_chunks (list chunks ) -> dict:
18941913 cdef:
18951914 list names = list (chunks[0 ].keys())
18961915 object name
@@ -1964,6 +1983,7 @@ for k in list(na_values):
19641983 na_values[np.dtype(k)] = na_values[k]
19651984
19661985
1986+ # -> ArrayLike
19671987cdef _apply_converter(object f, parser_t * parser, int64_t col,
19681988 int64_t line_start, int64_t line_end):
19691989 cdef:
@@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
19862006 return lib.maybe_convert_objects(result)
19872007
19882008
1989- def _maybe_encode (values ):
2009+ cdef list _maybe_encode(list values):
19902010 if values is None :
19912011 return []
19922012 return [x.encode(' utf-8' ) if isinstance (x, str ) else x for x in values]
19932013
19942014
2015+ # TODO: only ever called with convert_empty=False
19952016def sanitize_objects (ndarray[object] values , set na_values ,
1996- bint convert_empty = True ):
2017+ bint convert_empty = True ) -> int :
19972018 """
19982019 Convert specified values , including the given set na_values and empty
19992020 strings if convert_empty is True , to np.nan.
@@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values,
20032024 values : ndarray[object]
20042025 na_values : set
20052026 convert_empty : bool , default True
2027+
2028+ Returns
2029+ -------
2030+ na_count : int
20062031 """
20072032 cdef:
20082033 Py_ssize_t i , n
0 commit comments