@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
 
 import numpy as np
 
+import six
+from six import binary_type, text_type
+
 # Avoid import from outside _libs
 if sys.version_info.major == 2:
     from StringIO import StringIO
@@ -531,21 +534,84 @@ def try_parse_datetime_components(object[:] years,
 # ----------------------------------------------------------------------
 # Miscellaneous
 
-_DATEUTIL_LEXER_SPLIT = None
-try:
-    # Since these are private methods from dateutil, it is safely imported
-    # here so in case this interface changes, pandas will just fallback
-    # to not using the functionality
-    from dateutil.parser import _timelex
-
-    if hasattr(_timelex, 'split'):
-        def _lexer_split_from_str(dt_str):
-            # The StringIO(str(_)) is for dateutil 2.2 compatibility
-            return _timelex.split(StringIO(str(dt_str)))
 
-        _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
-except (ImportError, AttributeError):
-    pass
+# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
+#
+# Copyright (c) 2017 - Paul Ganssle <paul@ganssle.io>
+# Copyright (c) 2017 - dateutil contributors
+class _timelex(object):
+    def __init__(self, instream):
+        if six.PY2:
+            # In Python 2, we can't duck type properly because unicode has
+            # a 'decode' function, and we'd be double-decoding
+            if isinstance(instream, (binary_type, bytearray)):
+                instream = instream.decode()
+        else:
+            if getattr(instream, 'decode', None) is not None:
+                instream = instream.decode()
+
+        if isinstance(instream, text_type):
+            self.stream = instream
+        elif getattr(instream, 'read', None) is None:
+            raise TypeError(
+                'Parser must be a string or character stream, not '
+                '{itype}'.format(itype=instream.__class__.__name__))
+        else:
+            self.stream = instream.read()
+
+    def get_tokens(self):
+        """
+        This function breaks the time string into lexical units (tokens), which
+        can be parsed by the parser. Lexical units are demarcated by changes in
+        the character set, so any continuous string of letters is considered
+        one unit, any continuous string of numbers is considered one unit.
+        The main complication arises from the fact that dots ('.') can be used
+        both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
+        "4:30:21.447"). As such, it is necessary to read the full context of
+        any dot-separated strings before breaking it into tokens; as such, this
+        function maintains a "token stack", for when the ambiguous context
+        demands that multiple tokens be parsed at once.
+        """
+        stream = self.stream.replace('\x00', '')
+
+        # TODO: Change \s --> \s+ (this doesn't match existing behavior)
+        # TODO: change the punctuation block to punc+ (doesnt match existing)
+        # TODO: can we merge the two digit patterns?
+        tokens = re.findall('\s|'
+                            '(?<![\.\d])\d+\.\d+(?![\.\d])'
+                            '|\d+'
+                            '|[a-zA-Z]+'
+                            '|[\./:]+'
+                            '|[^\da-zA-Z\./:\s]+', stream)
+
+        # Re-combine token tuples of the form ["59", ",", "456"] because
+        # in this context the "," is treated as a decimal
+        # (e.g. in python's default logging format)
+        for n, token in enumerate(tokens[:-2]):
+            # Kludge to match ,-decimal behavior; it'd be better to do this
+            # later in the process and have a simpler tokenization
+            if (token is not None and token.isdigit() and
+                    tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
+                # Have to check None b/c it might be replaced during the loop
+                # TODO: I _really_ don't like faking the value here
+                tokens[n] = token + '.' + tokens[n + 2]
+                tokens[n + 1] = None
+                tokens[n + 2] = None
+
+        tokens = [x for x in tokens if x is not None]
+        return tokens
+
+    @classmethod
+    def split(cls, s):
+        return cls(s).get_tokens()
+
+
+def _lexer_split_from_str(dt_str):
+    # The StringIO(str(_)) is for dateutil 2.2 compatibility
+    return _timelex.split(StringIO(str(dt_str)))
+
+
+_DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
 
 
 def _format_is_iso(f) -> bint:
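
A rough usage sketch of the vendored lexer (not part of the change above): the import path is an assumption about how the built Cython module is exposed, and the token lists in the comments are worked out by hand from the regular expression and the comma-recombination loop in the diff.

# Illustrative only -- the import path is an assumption, not confirmed by the diff.
from pandas._libs.tslibs.parsing import _timelex

# Runs of letters, digits, whitespace and punctuation each become one token.
print(_timelex.split("January 1, 2047 at 8:21:00AM"))
# ['January', ' ', '1', ',', ' ', '2047', ' ', 'at', ' ', '8', ':', '21',
#  ':', '00', 'AM']

# A digit/comma/digit run is folded back into a single decimal token,
# matching dateutil's handling of "," as a decimal point.
print(_timelex.split("10:30:21,447"))
# ['10', ':', '30', ':', '21.447']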