@@ -34,7 +34,10 @@ import numpy as np
3434cimport util
3535
3636import pandas.lib as lib
37- from pandas.core.common import is_categorical_dtype, CategoricalDtype
37+ from pandas.core.common import (is_categorical_dtype, CategoricalDtype,
38+ is_integer_dtype, is_float_dtype,
39+ is_bool_dtype, is_object_dtype,
40+ is_string_dtype, is_datetime64_dtype)
3841from pandas.core.categorical import Categorical
3942from pandas.types.concat import union_categoricals
4043
@@ -224,19 +227,13 @@ cdef extern from "parser/tokenizer.h":
224227 int to_boolean(const char * item, uint8_t * val) nogil
225228
226229
227- # XXX
228- # this is a hack - in order to make the inference
229- # functions generic (converting either data directly
230- # from the parser or from a passed in hash table)
231- # we add an "optional" parameter via fused type, that can either
232- # be the hash table to parse, or an integer, which is used
233- # as a sentinel to specialize the function for reading
234- # from the parser.
235230
236- # This is to avoid duplicating a bunch of code or
237- # adding runtime checks, but may be too much
231+ # To make the inference functions generic,
232+ # each takes an optional last parameter that
233+ # names the data source to read from when it
234+ # is something other than the parser_t.
238235ctypedef kh_str_t* kh_str_t_p
239- ctypedef int use_parser_data
236+ ctypedef void * use_parser_data
240237
241238ctypedef fused inference_data_t:
242239 kh_str_t_p
@@ -421,11 +418,12 @@ cdef class TextReader:
421418
422419 self ._set_quoting(quotechar, quoting)
423420
424- # TODO: endianness just a placeholder?
421+
422+ dtype_order = [' int64' , ' float64' , ' bool' , ' object' ]
425423 if quoting == QUOTE_NONNUMERIC:
426- self .dtype_cast_order = [ ' <f8 ' , ' <i8 ' , ' |b1 ' , ' |O8 ' ]
427- else :
428- self .dtype_cast_order = [' <i8 ' , ' <f8 ' , ' |b1 ' , ' |O8 ' ]
424+ # consistent with csv module semantics, cast all to float
425+ dtype_order = dtype_order[ 1 :]
426+ self .dtype_cast_order = [np.dtype(x) for x in dtype_order ]
429427
430428 if comment is not None :
431429 if len (comment) > 1 :
@@ -1108,12 +1106,6 @@ cdef class TextReader:
11081106 col_dtype = self .dtype
11091107
11101108 if col_dtype is not None :
1111- if not isinstance (col_dtype, basestring ):
1112- if isinstance (col_dtype, np.dtype) or is_categorical_dtype(col_dtype):
1113- col_dtype = col_dtype.str
1114- else :
1115- col_dtype = np.dtype(col_dtype).str
1116-
11171109 col_res, na_count = self ._convert_with_dtype(col_dtype, i, start, end,
11181110 na_filter, 1 , na_hashset, na_flist)
11191111
@@ -1131,7 +1123,7 @@ cdef class TextReader:
11311123 dt, i, start, end, na_filter, 0 , na_hashset, na_flist)
11321124 except OverflowError :
11331125 col_res, na_count = self ._convert_with_dtype(
1134- ' |O8 ' , i, start, end, na_filter, 0 , na_hashset, na_flist)
1126+ np.dtype( ' object ' ) , i, start, end, na_filter, 0 , na_hashset, na_flist)
11351127
11361128 if col_res is not None :
11371129 break
@@ -1163,70 +1155,66 @@ cdef class TextReader:
11631155 bint user_dtype,
11641156 kh_str_t * na_hashset,
11651157 object na_flist):
1166- if dtype[ 1 ] == ' i ' or dtype[ 1 ] == ' u ' :
1167- result, na_count = _try_int64(self .parser, i, start, end ,
1168- na_filter, na_hashset ,
1169- < use_parser_data > NULL )
1158+ if is_integer_dtype( dtype) :
1159+ result, na_count = _try_int64[use_parser_data] (self .parser, i,
1160+ start, end, na_filter ,
1161+ na_hashset, NULL )
11701162 if user_dtype and na_count is not None :
11711163 if na_count > 0 :
11721164 raise ValueError (" Integer column has NA values in "
11731165 " column {column}" .format(column = i))
11741166
1175- if result is not None and dtype[ 1 :] != ' i8 ' :
1167+ if result is not None and dtype != ' int64 ' :
11761168 result = result.astype(dtype)
11771169
11781170 return result, na_count
11791171
1180- elif dtype[ 1 ] == ' f ' :
1181- result, na_count = _try_double(self .parser, i, start, end,
1182- na_filter, na_hashset, na_flist,
1183- < use_parser_data > NULL )
1172+ elif is_float_dtype( dtype) :
1173+ result, na_count = _try_double[use_parser_data] (self .parser, i, start, end,
1174+ na_filter, na_hashset, na_flist,
1175+ NULL )
11841176
1185- if result is not None and dtype[ 1 :] != ' f8 ' :
1177+ if result is not None and dtype != ' float64 ' :
11861178 result = result.astype(dtype)
11871179 return result, na_count
11881180
1189- elif dtype[ 1 ] == ' b ' :
1190- result, na_count = _try_bool_flex(self .parser, i, start, end,
1191- na_filter, na_hashset,
1192- self .true_set, self .false_set,
1193- < use_parser_data > NULL )
1181+ elif is_bool_dtype( dtype) :
1182+ result, na_count = _try_bool_flex[use_parser_data] (self .parser, i, start, end,
1183+ na_filter, na_hashset,
1184+ self .true_set, self .false_set,
1185+ NULL )
11941186 return result, na_count
1195- elif dtype[1 ] == ' c' :
1196- raise NotImplementedError (" the dtype %s is not supported for parsing" % dtype)
1197-
1198- elif dtype[1 ] == ' S' :
1187+ elif dtype.kind == ' S' :
11991188 # TODO: na handling
1200- width = int ( dtype[ 2 :])
1189+ width = dtype.itemsize
12011190 if width > 0 :
12021191 result = _to_fw_string(self .parser, i, start, end, width)
12031192 return result, 0
12041193
12051194 # treat as a regular string parsing
12061195 return self ._string_convert(i, start, end, na_filter,
12071196 na_hashset)
1208- elif dtype[ 1 ] == ' U' :
1209- width = int ( dtype[ 2 :])
1197+ elif dtype.kind == ' U' :
1198+ width = dtype.itemsize
12101199 if width > 0 :
12111200 raise NotImplementedError (" the dtype %s is not supported for parsing" % dtype)
12121201
12131202 # unicode variable width
12141203 return self ._string_convert(i, start, end, na_filter,
12151204 na_hashset)
12161205 # is this comparison good enough?
1217- elif dtype == ' |O08 ' :
1206+ elif is_categorical_dtype( dtype) :
12181207 codes, cats, na_count = _categorical_convert(self .parser, i, start,
12191208 end, na_filter, na_hashset,
12201209 na_flist, self .true_set,
12211210 self .false_set, self .c_encoding)
1222-
12231211 return Categorical(codes, categories = cats, ordered = False ,
12241212 fastpath = True ), na_count
1225- elif dtype[ 1 ] == ' O ' :
1213+ elif is_object_dtype( dtype) :
12261214 return self ._string_convert(i, start, end, na_filter,
12271215 na_hashset)
12281216 else :
1229- if dtype[ 1 ] == ' M ' :
1217+ if is_datetime64_dtype( dtype) :
12301218 raise TypeError (" the dtype %s is not supported for parsing, "
12311219 " pass this column using parse_dates instead" % dtype)
12321220 raise TypeError (" the dtype %s is not supported for parsing" % dtype)
@@ -1588,7 +1576,7 @@ cdef _categorical_convert(parser_t *parser, int col,
15881576
15891577 codes[i] = table.vals[k]
15901578
1591-
1579+ # Codes are complete, now inference on cats
15921580 # follow the same inference attempts as
15931581 # normal data (int64, float64, bool, object)
15941582 result, result_na = _try_int64(parser, col, 0 , table.n_occupied,
@@ -1603,9 +1591,10 @@ cdef _categorical_convert(parser_t *parser, int col,
16031591 result, result_na = _try_bool_flex(parser, col, 0 , table.n_occupied,
16041592 na_filter, na_hashset, true_hashset,
16051593 false_hashset, table)
1606- # duplicated logic here, but doesn't make sense to reuse
1607- # other string logic since those paths factorize where we
1608- # already have guaranteed uniques
1594+
1595+ # if no numeric types parsed, convert to object.
1596+ # Note that the decoding path logic should stay in sync with that
1597+ # of `TextReader._string_convert`
16091598 if result is None :
16101599 i = 0
16111600 result = np.empty(table.n_occupied, dtype = np.object_)
@@ -1694,10 +1683,10 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end,
16941683
16951684
16961685cdef inline int _try_double_nogil(parser_t * parser, int col, int line_start, int line_end,
1697- bint na_filter, kh_str_t * na_hashset, bint use_na_flist,
1698- const kh_float64_t * na_flist,
1699- double NA, double * data, int * na_count,
1700- inference_data_t inference_data) nogil:
1686+ bint na_filter, kh_str_t * na_hashset, bint use_na_flist,
1687+ const kh_float64_t * na_flist,
1688+ double NA, double * data, int * na_count,
1689+ inference_data_t inference_data) nogil:
17011690 cdef:
17021691 int error,
17031692 size_t i
@@ -1783,7 +1772,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
17831772 lines = line_end - line_start
17841773 result = np.empty(lines, dtype = np.int64)
17851774 data = < int64_t * > result.data
1786- # compile time
17871775 with nogil:
17881776 error = _try_int64_nogil(parser, col, line_start, line_end, na_filter,
17891777 na_hashset, NA, data, & na_count, inference_data)
@@ -2104,7 +2092,6 @@ def _concatenate_chunks(list chunks):
21042092
21052093 if is_categorical_dtype(dtypes.pop()):
21062094 result[name] = union_categoricals(arrs)
2107- # np.concatenate([c.codes for c in arrs])
21082095 else :
21092096 result[name] = np.concatenate(arrs)
21102097
0 commit comments