diff --git a/LICENSE b/LICENSE index 3fee963db7499..44983fd1259e5 100644 --- a/LICENSE +++ b/LICENSE @@ -221,6 +221,7 @@ connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaR Python Software Foundation License ---------------------------------- +python/pyspark/loose_version.py python/docs/source/_static/copybutton.js BSD 3-Clause diff --git a/LICENSE-binary b/LICENSE-binary index f0f59e7d57644..30fca96a8832d 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -421,6 +421,12 @@ This section summarizes those components and their licenses. See licenses-binary for text of these licenses. +Python Software Foundation License +---------------------------------- + +python/pyspark/loose_version.py + + BSD 2-Clause ------------ diff --git a/dev/.rat-excludes b/dev/.rat-excludes index d8fc418c3146b..d755c0ff749c4 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -142,3 +142,4 @@ empty.proto LimitedInputStream.java TimSort.java xml-resources/* +loose_version.py diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index b8bca7776dd5c..a9a2a31702562 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -49,11 +49,6 @@ from functools import wraps import types from typing import cast, Any, Callable, Optional, TypeVar, Union -from warnings import filterwarnings - -filterwarnings( - "ignore", message="distutils Version classes are deprecated. Use packaging.version instead." -) from pyspark.conf import SparkConf from pyspark.rdd import RDD, RDDBarrier diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py new file mode 100644 index 0000000000000..11c27f4ead8f0 --- /dev/null +++ b/python/pyspark/loose_version.py @@ -0,0 +1,75 @@ +# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# https://github.com/python/cpython/blob/3.11/LICENSE +# File originates from the cpython source +# https://github.com/python/cpython/blob/3.11/Lib/distutils/version.py + +import re +from typing import Optional + + +class LooseVersion: + component_re = re.compile(r"(\d+ | [a-z]+ | \.)", re.VERBOSE) + + def __init__(self, vstring: Optional[str]) -> None: + if vstring: + self.parse(vstring) + + def parse(self, vstring: str) -> None: + self.vstring = vstring + components = [x for x in self.component_re.split(vstring) if x and x != "."] + for i, obj in enumerate(components): + try: + components[i] = int(obj) + except ValueError: + pass + + self.version = components + + def __str__(self) -> str: + return self.vstring + + def __repr__(self) -> str: + return "LooseVersion ('%s')" % str(self) + + def __eq__(self, other): # type: ignore[no-untyped-def] + c = self._cmp(other) + if c is NotImplemented: + return c + return c == 0 + + def __lt__(self, other): # type: ignore[no-untyped-def] + c = self._cmp(other) + if c is NotImplemented: + return c + return c < 0 + + def __le__(self, other): # type: ignore[no-untyped-def] + c = self._cmp(other) + if c is NotImplemented: + return c + return c <= 0 + + def __gt__(self, other): # type: ignore[no-untyped-def] + c = self._cmp(other) + if c is NotImplemented: + return c + return c > 0 + + def __ge__(self, other): # type: ignore[no-untyped-def] + c = self._cmp(other) + if c is NotImplemented: + return c + return c >= 0 + + def _cmp(self, other): # type: ignore[no-untyped-def] + if isinstance(other, str): + other = LooseVersion(other) + elif not isinstance(other, LooseVersion): + return NotImplemented + + if self.version == other.version: + return 0 + if self.version < other.version: + return -1 + if self.version > other.version: + return 1 diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index 0164ec9f980e4..42f30ebf7ae08 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -15,7 +15,7 @@ # limitations under the License. # -from distutils.version import LooseVersion +from pyspark.loose_version import LooseVersion import matplotlib as mat import numpy as np diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index f00757fe36678..a43ad19801181 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -19,7 +19,7 @@ Generate 'Supported pandas APIs' documentation file """ import warnings -from distutils.version import LooseVersion +from pyspark.loose_version import LooseVersion from enum import Enum, unique from inspect import getmembers, isclass, isfunction, signature from typing import Any, Callable, Dict, List, NamedTuple, Set, TextIO, Tuple diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py index 64f293c48d64a..6c120aead4e65 100644 --- a/python/pyspark/pandas/tests/computation/test_any_all.py +++ b/python/pyspark/pandas/tests/computation/test_any_all.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/computation/test_corrwith.py b/python/pyspark/pandas/tests/computation/test_corrwith.py index 4db61c1585430..b64bf2d411b25 100644 --- a/python/pyspark/pandas/tests/computation/test_corrwith.py +++ b/python/pyspark/pandas/tests/computation/test_corrwith.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest diff --git a/python/pyspark/pandas/tests/computation/test_cov.py b/python/pyspark/pandas/tests/computation/test_cov.py index 979f0b73839c3..23e5ec587e9a9 100644 --- a/python/pyspark/pandas/tests/computation/test_cov.py +++ b/python/pyspark/pandas/tests/computation/test_cov.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import decimal diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index d4e44afba911d..33332503943d3 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -17,7 +17,6 @@ import datetime import unittest -from distutils.version import LooseVersion import pandas as pd from pandas.api.types import CategoricalDtype diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py index d3b36dab4514d..1de0a61ab4b5e 100644 --- a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py @@ -14,12 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import pandas as pd import numpy as np from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.pandas.config import set_option, reset_option from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils diff --git a/python/pyspark/pandas/tests/groupby/test_aggregate.py b/python/pyspark/pandas/tests/groupby/test_aggregate.py index c2d7872c8434f..9e7f3f6cd113d 100644 --- a/python/pyspark/pandas/tests/groupby/test_aggregate.py +++ b/python/pyspark/pandas/tests/groupby/test_aggregate.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_apply_func.py b/python/pyspark/pandas/tests/groupby/test_apply_func.py index da6337b0ca958..e7a30ff57b414 100644 --- a/python/pyspark/pandas/tests/groupby/test_apply_func.py +++ b/python/pyspark/pandas/tests/groupby/test_apply_func.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/groupby/test_head_tail.py b/python/pyspark/pandas/tests/groupby/test_head_tail.py index 1a22db74f26f4..6f69b0bdee003 100644 --- a/python/pyspark/pandas/tests/groupby/test_head_tail.py +++ b/python/pyspark/pandas/tests/groupby/test_head_tail.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/groupby/test_index.py b/python/pyspark/pandas/tests/groupby/test_index.py index 9c73e59eabf04..9219a65d2cb46 100644 --- a/python/pyspark/pandas/tests/groupby/test_index.py +++ b/python/pyspark/pandas/tests/groupby/test_index.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py index a3ef8c73de44a..e8648c9b0a84f 100644 --- a/python/pyspark/pandas/tests/groupby/test_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py index 695d079db49da..29991ae1d54c4 100644 --- a/python/pyspark/pandas/tests/groupby/test_stat.py +++ b/python/pyspark/pandas/tests/groupby/test_stat.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index ccdb575ff702d..e84ab60f1216f 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -17,13 +17,13 @@ import inspect import unittest -from distutils.version import LooseVersion from datetime import datetime, timedelta import numpy as np import pandas as pd import pyspark.pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError from pyspark.pandas.missing.indexes import ( MissingPandasLikeDatetimeIndex, diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 5b6bd2cad078e..0cd89711beedb 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -16,12 +16,12 @@ # import unittest -from distutils.version import LooseVersion import pandas as pd from pandas.api.types import CategoricalDtype import pyspark.pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 4eaefb514d917..d89e448dd4f9d 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -17,8 +17,6 @@ import datetime -from distutils.version import LooseVersion - import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_indexing.py b/python/pyspark/pandas/tests/indexes/test_indexing.py index 111dd09696d79..3facb1929b4fe 100644 --- a/python/pyspark/pandas/tests/indexes/test_indexing.py +++ b/python/pyspark/pandas/tests/indexes/test_indexing.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_reindex.py b/python/pyspark/pandas/tests/indexes/test_reindex.py index 26eb97fdb552e..1d544ea221bf9 100644 --- a/python/pyspark/pandas/tests/indexes/test_reindex.py +++ b/python/pyspark/pandas/tests/indexes/test_reindex.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py index 5321f96eeaba7..6bab794f3abbb 100644 --- a/python/pyspark/pandas/tests/indexes/test_timedelta.py +++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py @@ -17,7 +17,6 @@ import unittest from datetime import timedelta -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py index 3f6a24708db52..9e48de893a130 100644 --- a/python/pyspark/pandas/tests/series/test_compute.py +++ b/python/pyspark/pandas/tests/series/test_compute.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion from itertools import product import numpy as np diff --git a/python/pyspark/pandas/tests/test_csv.py b/python/pyspark/pandas/tests/test_csv.py index a367dd72be1e5..a62388050472c 100644 --- a/python/pyspark/pandas/tests/test_csv.py +++ b/python/pyspark/pandas/tests/test_csv.py @@ -20,7 +20,6 @@ import tempfile import unittest from contextlib import contextmanager -from distutils.version import LooseVersion import pandas as pd import numpy as np diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/test_dataframe_conversion.py index 283849a06d58a..134cf8bd1c103 100644 --- a/python/pyspark/pandas/tests/test_dataframe_conversion.py +++ b/python/pyspark/pandas/tests/test_dataframe_conversion.py @@ -21,7 +21,6 @@ import tempfile import unittest import sys -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index 1667524910b9b..41be0eee4b864 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -18,12 +18,12 @@ import unittest import glob import os -from distutils.version import LooseVersion import numpy as np import pandas as pd from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index f39a3c4a0abc0..12f81a2e8588e 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -15,7 +15,6 @@ # limitations under the License. # -from distutils.version import LooseVersion from itertools import product import unittest diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 5a8b1e3792016..0b8fe26cb8381 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/test_rolling.py b/python/pyspark/pandas/tests/test_rolling.py index 526962e3bbdd2..c7e49eab5bb5e 100644 --- a/python/pyspark/pandas/tests/test_rolling.py +++ b/python/pyspark/pandas/tests/test_rolling.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_conversion.py b/python/pyspark/pandas/tests/test_series_conversion.py index b0a97b0a6f811..926c641ebc9c7 100644 --- a/python/pyspark/pandas/tests/test_series_conversion.py +++ b/python/pyspark/pandas/tests/test_series_conversion.py @@ -17,7 +17,6 @@ import unittest import sys -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py index c7ffc0675c6d8..89d4b70e0b51c 100644 --- a/python/pyspark/pandas/tests/test_series_datetime.py +++ b/python/pyspark/pandas/tests/test_series_datetime.py @@ -17,7 +17,6 @@ import datetime import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py index 93c6473f7d37c..b8d35764f1bc8 100644 --- a/python/pyspark/pandas/tests/test_series_string.py +++ b/python/pyspark/pandas/tests/test_series_string.py @@ -19,7 +19,6 @@ import numpy as np import re import unittest -from distutils.version import LooseVersion from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 1f5b6a732a566..40ee64a5f68d1 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 817a92b8faa7e..db7f8e6dc75c9 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -19,6 +19,7 @@ "SparkConnectClient", ] +from pyspark.loose_version import LooseVersion from pyspark.sql.connect.utils import check_dependencies check_dependencies(__name__) @@ -31,7 +32,6 @@ import urllib.parse import uuid import sys -from distutils.version import LooseVersion from types import TracebackType from typing import ( Iterable, diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index e5d1d95a69967..53bf19b78c897 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -22,7 +22,6 @@ import os import warnings from collections.abc import Sized -from distutils.version import LooseVersion from functools import reduce from threading import RLock from typing import ( @@ -50,6 +49,7 @@ import urllib from pyspark import SparkContext, SparkConf, __version__ +from pyspark.loose_version import LooseVersion from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder from pyspark.sql.connect.conf import RuntimeConf from pyspark.sql.connect.dataframe import DataFrame diff --git a/python/pyspark/sql/connect/utils.py b/python/pyspark/sql/connect/utils.py index 8872ba50633cd..e96529e44f8a3 100644 --- a/python/pyspark/sql/connect/utils.py +++ b/python/pyspark/sql/connect/utils.py @@ -16,6 +16,7 @@ # import sys +from pyspark.loose_version import LooseVersion from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version @@ -39,8 +40,6 @@ def require_minimum_grpc_version() -> None: """Raise ImportError if minimum version of grpc is not installed""" minimum_grpc_version = "1.48.1" - from distutils.version import LooseVersion - try: import grpc except ImportError as error: diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index abbc9f9441f06..5288f0e100bb2 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -26,9 +26,9 @@ TYPE_CHECKING, ) from warnings import warn -from distutils.version import LooseVersion from pyspark.errors.exceptions.captured import unwrap_spark_exception +from pyspark.loose_version import LooseVersion from pyspark.rdd import _load_from_socket from pyspark.sql.pandas.serializers import ArrowCollectSerializer from pyspark.sql.pandas.types import _dedup_names diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 9aa2be96add63..4c1d1c177d638 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -20,6 +20,7 @@ """ from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError +from pyspark.loose_version import LooseVersion from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer from pyspark.sql.pandas.types import ( from_arrow_type, @@ -188,7 +189,6 @@ def arrow_to_pandas(self, arrow_column, struct_in_pandas="dict", ndarray_as_list pandas_options = {"date_as_object": True} import pyarrow as pa - from distutils.version import LooseVersion if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"): # A legacy option to coerce date32, date64, duration, and timestamp diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index c7504f901e629..b62be20810288 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -15,14 +15,14 @@ # limitations under the License. # +from pyspark.loose_version import LooseVersion + def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. minimum_pandas_version = "1.4.4" - from distutils.version import LooseVersion - try: import pandas @@ -46,7 +46,6 @@ def require_minimum_pyarrow_version() -> None: # TODO(HyukjinKwon): Relocate and deduplicate the version specification. minimum_pyarrow_version = "4.0.0" - from distutils.version import LooseVersion import os try: