@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
1111
1212import numpy as np
1313
14+ import six
15+ from six import binary_type, text_type
16+
1417# Avoid import from outside _libs
1518if sys.version_info.major == 2 :
1619 from StringIO import StringIO
@@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years,
531534# ----------------------------------------------------------------------
532535# Miscellaneous
533536
534- _DATEUTIL_LEXER_SPLIT = None
535- try :
536- # Since these are private methods from dateutil, it is safely imported
537- # here so in case this interface changes, pandas will just fallback
538- # to not using the functionality
539- from dateutil.parser import _timelex
540-
541- if hasattr (_timelex, ' split' ):
542- def _lexer_split_from_str (dt_str ):
543- # The StringIO(str(_)) is for dateutil 2.2 compatibility
544- return _timelex.split(StringIO(str (dt_str)))
545537
546- _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547- except (ImportError , AttributeError ):
548- pass
538+ # Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
539+ #
540+ # We use this class to parse and tokenize date strings. However, as it is
541+ # a private class in the dateutil library, relying on backwards compatibility
542+ # is not practical. In fact, using this class issues warnings (xref gh-21322).
543+ # Thus, we port the class over so that both issues are resolved.
544+ #
545+ # Copyright (c) 2017 - dateutil contributors
546+ class _timelex (object ):
547+ def __init__ (self , instream ):
548+ if six.PY2:
549+ # In Python 2, we can't duck type properly because unicode has
550+ # a 'decode' function, and we'd be double-decoding
551+ if isinstance (instream, (binary_type, bytearray)):
552+ instream = instream.decode()
553+ else :
554+ if getattr (instream, ' decode' , None ) is not None :
555+ instream = instream.decode()
556+
557+ if isinstance (instream, text_type):
558+ self .stream = instream
559+ elif getattr (instream, ' read' , None ) is None :
560+ raise TypeError (
561+ ' Parser must be a string or character stream, not '
562+ ' {itype}' .format(itype = instream.__class__ .__name__ ))
563+ else :
564+ self .stream = instream.read()
565+
566+ def get_tokens (self ):
567+ """
568+ This function breaks the time string into lexical units (tokens), which
569+ can be parsed by the parser. Lexical units are demarcated by changes in
570+ the character set, so any continuous string of letters is considered
571+ one unit, any continuous string of numbers is considered one unit.
572+ The main complication arises from the fact that dots ('.') can be used
573+ both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
574+ "4:30:21.447"). As such, it is necessary to read the full context of
575+ any dot-separated strings before breaking it into tokens; as such, this
576+ function maintains a "token stack", for when the ambiguous context
577+ demands that multiple tokens be parsed at once.
578+ """
579+ stream = self .stream.replace(' \x00 ' , ' ' )
580+
581+ # TODO: Change \s --> \s+ (this doesn't match existing behavior)
582+ # TODO: change the punctuation block to punc+ (doesnt match existing)
583+ # TODO: can we merge the two digit patterns?
584+ tokens = re.findall(' \s|'
585+ ' (?<![\.\d])\d+\.\d+(?![\.\d])'
586+ ' |\d+'
587+ ' |[a-zA-Z]+'
588+ ' |[\./:]+'
589+ ' |[^\da-zA-Z\./:\s]+' , stream)
590+
591+ # Re-combine token tuples of the form ["59", ",", "456"] because
592+ # in this context the "," is treated as a decimal
593+ # (e.g. in python's default logging format)
594+ for n, token in enumerate (tokens[:- 2 ]):
595+ # Kludge to match ,-decimal behavior; it'd be better to do this
596+ # later in the process and have a simpler tokenization
597+ if (token is not None and token.isdigit() and
598+ tokens[n + 1 ] == ' ,' and tokens[n + 2 ].isdigit()):
599+ # Have to check None b/c it might be replaced during the loop
600+ # TODO: I _really_ don't faking the value here
601+ tokens[n] = token + ' .' + tokens[n + 2 ]
602+ tokens[n + 1 ] = None
603+ tokens[n + 2 ] = None
604+
605+ tokens = [x for x in tokens if x is not None ]
606+ return tokens
607+
608+ @classmethod
609+ def split (cls , s ):
610+ return cls (s).get_tokens()
611+
612+
# Lexer hook used by this module to split datetime strings into tokens;
# bound to the ported dateutil tokenizer's classmethod defined above.
_DATEUTIL_LEXER_SPLIT = _timelex.split
549614
550615
551616def _format_is_iso (f ) -> bint:
0 commit comments