@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
1111
1212import numpy as np
1313
14+ import six
15+ from six import binary_type, text_type
16+
1417# Avoid import from outside _libs
1518if sys.version_info.major == 2 :
1619 from StringIO import StringIO
@@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years,
531534# ----------------------------------------------------------------------
532535# Miscellaneous
533536
534- _DATEUTIL_LEXER_SPLIT = None
535- try :
536- # Since these are private methods from dateutil, it is safely imported
537- # here so in case this interface changes, pandas will just fallback
538- # to not using the functionality
539- from dateutil.parser import _timelex
540-
541- if hasattr (_timelex, ' split' ):
542- def _lexer_split_from_str (dt_str ):
543- # The StringIO(str(_)) is for dateutil 2.2 compatibility
544- return _timelex.split(StringIO(str (dt_str)))
545537
546- _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547- except (ImportError , AttributeError ):
548- pass
538+ # Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
539+ #
540+ # We use this class to parse and tokenize date strings. However, as it is
541+ # a private class in the dateutil library, relying on backwards compatibility
542+ # is not practical. In fact, using this class issues warnings (xref gh-21322).
543+ # Thus, we port the class over so that both issues are resolved.
544+ #
545+ # Copyright (c) 2017 - dateutil contributors
546+ class _timelex (object ):
547+ def __init__ (self , instream ):
548+ if six.PY2:
549+ # In Python 2, we can't duck type properly because unicode has
550+ # a 'decode' function, and we'd be double-decoding
551+ if isinstance (instream, (binary_type, bytearray)):
552+ instream = instream.decode()
553+ else :
554+ if getattr (instream, ' decode' , None ) is not None :
555+ instream = instream.decode()
556+
557+ if isinstance (instream, text_type):
558+ self .stream = instream
559+ elif getattr (instream, ' read' , None ) is None :
560+ raise TypeError (
561+ ' Parser must be a string or character stream, not '
562+ ' {itype}' .format(itype = instream.__class__ .__name__ ))
563+ else :
564+ self .stream = instream.read()
565+
566+ def get_tokens (self ):
567+ """
568+ This function breaks the time string into lexical units (tokens), which
569+ can be parsed by the parser. Lexical units are demarcated by changes in
570+ the character set, so any continuous string of letters is considered
571+ one unit, any continuous string of numbers is considered one unit.
572+ The main complication arises from the fact that dots ('.') can be used
573+ both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
574+ "4:30:21.447"). As such, it is necessary to read the full context of
575+ any dot-separated strings before breaking it into tokens; as such, this
576+ function maintains a "token stack", for when the ambiguous context
577+ demands that multiple tokens be parsed at once.
578+ """
579+ stream = self .stream.replace(' \x00 ' , ' ' )
580+
581+ # TODO: Change \s --> \s+ (this doesn't match existing behavior)
582+ # TODO: change the punctuation block to punc+ (doesnt match existing)
583+ # TODO: can we merge the two digit patterns?
584+ tokens = re.findall(' \s|'
585+ ' (?<![\.\d])\d+\.\d+(?![\.\d])'
586+ ' |\d+'
587+ ' |[a-zA-Z]+'
588+ ' |[\./:]+'
589+ ' |[^\da-zA-Z\./:\s]+' , stream)
590+
591+ # Re-combine token tuples of the form ["59", ",", "456"] because
592+ # in this context the "," is treated as a decimal
593+ # (e.g. in python's default logging format)
594+ for n, token in enumerate (tokens[:- 2 ]):
595+ # Kludge to match ,-decimal behavior; it'd be better to do this
596+ # later in the process and have a simpler tokenization
597+ if (token is not None and token.isdigit() and
598+ tokens[n + 1 ] == ' ,' and tokens[n + 2 ].isdigit()):
599+ # Have to check None b/c it might be replaced during the loop
600+ # TODO: I _really_ don't faking the value here
601+ tokens[n] = token + ' .' + tokens[n + 2 ]
602+ tokens[n + 1 ] = None
603+ tokens[n + 2 ] = None
604+
605+ tokens = [x for x in tokens if x is not None ]
606+ return tokens
607+
608+ @classmethod
609+ def split (cls , s ):
610+ return cls (s).get_tokens()
611+
612+
# Lexer hook used by this module to split datetime strings into tokens;
# bound to the ported dateutil tokenizer's classmethod defined above.
_DATEUTIL_LEXER_SPLIT = _timelex.split
549614
550615
551616def _format_is_iso (f ) -> bint:
0 commit comments