From adfd9268fe4f7f859085efa6bf70a7dab2569f7b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 29 Sep 2023 15:01:59 -0700 Subject: [PATCH 1/3] [SPARK-45390][PYTHON] Remove `distutils` usage --- LICENSE | 1 + LICENSE-binary | 6 ++ dev/.rat-excludes | 1 + python/pyspark/__init__.py | 4 - python/pyspark/loose_version.py | 78 +++++++++++++++++++ python/pyspark/pandas/plot/matplotlib.py | 2 +- python/pyspark/pandas/supported_api_gen.py | 2 +- .../pandas/tests/computation/test_any_all.py | 1 - .../pandas/tests/computation/test_corrwith.py | 1 - .../pandas/tests/computation/test_cov.py | 1 - .../tests/data_type_ops/test_date_ops.py | 1 - .../diff_frames_ops/test_cov_corrwith.py | 2 +- .../pandas/tests/groupby/test_aggregate.py | 1 - .../pandas/tests/groupby/test_apply_func.py | 1 - .../pandas/tests/groupby/test_head_tail.py | 1 - .../pandas/tests/groupby/test_index.py | 1 - .../pandas/tests/groupby/test_split_apply.py | 1 - .../pyspark/pandas/tests/groupby/test_stat.py | 1 - .../pyspark/pandas/tests/indexes/test_base.py | 2 +- .../pandas/tests/indexes/test_category.py | 2 +- .../pandas/tests/indexes/test_datetime.py | 2 - .../pandas/tests/indexes/test_indexing.py | 1 - .../pandas/tests/indexes/test_reindex.py | 1 - .../pandas/tests/indexes/test_timedelta.py | 1 - .../pandas/tests/series/test_compute.py | 1 - python/pyspark/pandas/tests/test_csv.py | 1 - .../pandas/tests/test_dataframe_conversion.py | 1 - .../pandas/tests/test_dataframe_spark_io.py | 2 +- .../pandas/tests/test_ops_on_diff_frames.py | 1 - .../tests/test_ops_on_diff_frames_groupby.py | 1 - python/pyspark/pandas/tests/test_rolling.py | 1 - .../pandas/tests/test_series_conversion.py | 1 - .../pandas/tests/test_series_datetime.py | 1 - .../pandas/tests/test_series_string.py | 1 - python/pyspark/pandas/tests/test_stats.py | 1 - python/pyspark/sql/connect/client/core.py | 2 +- python/pyspark/sql/connect/session.py | 2 +- python/pyspark/sql/connect/utils.py | 3 +- python/pyspark/sql/pandas/conversion.py | 2 +- python/pyspark/sql/pandas/serializers.py | 2 +- python/pyspark/sql/pandas/utils.py | 4 +- 41 files changed, 98 insertions(+), 44 deletions(-) create mode 100644 python/pyspark/loose_version.py diff --git a/LICENSE b/LICENSE index 3fee963db7499..44983fd1259e5 100644 --- a/LICENSE +++ b/LICENSE @@ -221,6 +221,7 @@ connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaR Python Software Foundation License ---------------------------------- +python/pyspark/loose_version.py python/docs/source/_static/copybutton.js BSD 3-Clause diff --git a/LICENSE-binary b/LICENSE-binary index f0f59e7d57644..30fca96a8832d 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -421,6 +421,12 @@ This section summarizes those components and their licenses. See licenses-binary for text of these licenses. +Python Software Foundation License +---------------------------------- + +python/pyspark/loose_version.py + + BSD 2-Clause ------------ diff --git a/dev/.rat-excludes b/dev/.rat-excludes index d8fc418c3146b..d755c0ff749c4 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -142,3 +142,4 @@ empty.proto LimitedInputStream.java TimSort.java xml-resources/* +loose_version.py diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index b8bca7776dd5c..21c4dfce4e3ff 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -51,10 +51,6 @@ from typing import cast, Any, Callable, Optional, TypeVar, Union from warnings import filterwarnings -filterwarnings( - "ignore", message="distutils Version classes are deprecated. Use packaging.version instead." -) - from pyspark.conf import SparkConf from pyspark.rdd import RDD, RDDBarrier from pyspark.files import SparkFiles diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py new file mode 100644 index 0000000000000..74d5285041f37 --- /dev/null +++ b/python/pyspark/loose_version.py @@ -0,0 +1,78 @@ +import re + +class LooseVersion: + + component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE) + + def __init__ (self, vstring=None): + if vstring: + self.parse(vstring) + + + def parse (self, vstring): + # I've given up on thinking I can reconstruct the version string + # from the parsed tuple -- so I just store the string here for + # use by __str__ + self.vstring = vstring + components = [x for x in self.component_re.split(vstring) + if x and x != '.'] + for i, obj in enumerate(components): + try: + components[i] = int(obj) + except ValueError: + pass + + self.version = components + + + def __str__ (self): + return self.vstring + + + def __repr__ (self): + return "LooseVersion ('%s')" % str(self) + + + def __eq__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c == 0 + + def __lt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c < 0 + + def __le__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c <= 0 + + def __gt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c > 0 + + def __ge__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c >= 0 + + def _cmp (self, other): + if isinstance(other, str): + other = LooseVersion(other) + elif not isinstance(other, LooseVersion): + return NotImplemented + + if self.version == other.version: + return 0 + if self.version < other.version: + return -1 + if self.version > other.version: + return 1 + diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index 0164ec9f980e4..42f30ebf7ae08 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -15,7 +15,7 @@ # limitations under the License. # -from distutils.version import LooseVersion +from pyspark.loose_version import LooseVersion import matplotlib as mat import numpy as np diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index f00757fe36678..a43ad19801181 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -19,7 +19,7 @@ Generate 'Supported pandas APIs' documentation file """ import warnings -from distutils.version import LooseVersion +from pyspark.loose_version import LooseVersion from enum import Enum, unique from inspect import getmembers, isclass, isfunction, signature from typing import Any, Callable, Dict, List, NamedTuple, Set, TextIO, Tuple diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py index 64f293c48d64a..6c120aead4e65 100644 --- a/python/pyspark/pandas/tests/computation/test_any_all.py +++ b/python/pyspark/pandas/tests/computation/test_any_all.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/computation/test_corrwith.py b/python/pyspark/pandas/tests/computation/test_corrwith.py index 4db61c1585430..b64bf2d411b25 100644 --- a/python/pyspark/pandas/tests/computation/test_corrwith.py +++ b/python/pyspark/pandas/tests/computation/test_corrwith.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest diff --git a/python/pyspark/pandas/tests/computation/test_cov.py b/python/pyspark/pandas/tests/computation/test_cov.py index 979f0b73839c3..23e5ec587e9a9 100644 --- a/python/pyspark/pandas/tests/computation/test_cov.py +++ b/python/pyspark/pandas/tests/computation/test_cov.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import decimal diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index d4e44afba911d..33332503943d3 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -17,7 +17,6 @@ import datetime import unittest -from distutils.version import LooseVersion import pandas as pd from pandas.api.types import CategoricalDtype diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py index d3b36dab4514d..1de0a61ab4b5e 100644 --- a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py @@ -14,12 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import pandas as pd import numpy as np from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.pandas.config import set_option, reset_option from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils diff --git a/python/pyspark/pandas/tests/groupby/test_aggregate.py b/python/pyspark/pandas/tests/groupby/test_aggregate.py index c2d7872c8434f..9e7f3f6cd113d 100644 --- a/python/pyspark/pandas/tests/groupby/test_aggregate.py +++ b/python/pyspark/pandas/tests/groupby/test_aggregate.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_apply_func.py b/python/pyspark/pandas/tests/groupby/test_apply_func.py index da6337b0ca958..e7a30ff57b414 100644 --- a/python/pyspark/pandas/tests/groupby/test_apply_func.py +++ b/python/pyspark/pandas/tests/groupby/test_apply_func.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/groupby/test_head_tail.py b/python/pyspark/pandas/tests/groupby/test_head_tail.py index 1a22db74f26f4..6f69b0bdee003 100644 --- a/python/pyspark/pandas/tests/groupby/test_head_tail.py +++ b/python/pyspark/pandas/tests/groupby/test_head_tail.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/groupby/test_index.py b/python/pyspark/pandas/tests/groupby/test_index.py index 9c73e59eabf04..9219a65d2cb46 100644 --- a/python/pyspark/pandas/tests/groupby/test_index.py +++ b/python/pyspark/pandas/tests/groupby/test_index.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py index a3ef8c73de44a..e8648c9b0a84f 100644 --- a/python/pyspark/pandas/tests/groupby/test_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import pandas as pd diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py index 695d079db49da..29991ae1d54c4 100644 --- a/python/pyspark/pandas/tests/groupby/test_stat.py +++ b/python/pyspark/pandas/tests/groupby/test_stat.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from distutils.version import LooseVersion import unittest import numpy as np diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index ccdb575ff702d..e84ab60f1216f 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -17,13 +17,13 @@ import inspect import unittest -from distutils.version import LooseVersion from datetime import datetime, timedelta import numpy as np import pandas as pd import pyspark.pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError from pyspark.pandas.missing.indexes import ( MissingPandasLikeDatetimeIndex, diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 5b6bd2cad078e..0cd89711beedb 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -16,12 +16,12 @@ # import unittest -from distutils.version import LooseVersion import pandas as pd from pandas.api.types import CategoricalDtype import pyspark.pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 4eaefb514d917..d89e448dd4f9d 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -17,8 +17,6 @@ import datetime -from distutils.version import LooseVersion - import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_indexing.py b/python/pyspark/pandas/tests/indexes/test_indexing.py index 111dd09696d79..3facb1929b4fe 100644 --- a/python/pyspark/pandas/tests/indexes/test_indexing.py +++ b/python/pyspark/pandas/tests/indexes/test_indexing.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_reindex.py b/python/pyspark/pandas/tests/indexes/test_reindex.py index 26eb97fdb552e..1d544ea221bf9 100644 --- a/python/pyspark/pandas/tests/indexes/test_reindex.py +++ b/python/pyspark/pandas/tests/indexes/test_reindex.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py index 5321f96eeaba7..6bab794f3abbb 100644 --- a/python/pyspark/pandas/tests/indexes/test_timedelta.py +++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py @@ -17,7 +17,6 @@ import unittest from datetime import timedelta -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py index 3f6a24708db52..9e48de893a130 100644 --- a/python/pyspark/pandas/tests/series/test_compute.py +++ b/python/pyspark/pandas/tests/series/test_compute.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion from itertools import product import numpy as np diff --git a/python/pyspark/pandas/tests/test_csv.py b/python/pyspark/pandas/tests/test_csv.py index a367dd72be1e5..a62388050472c 100644 --- a/python/pyspark/pandas/tests/test_csv.py +++ b/python/pyspark/pandas/tests/test_csv.py @@ -20,7 +20,6 @@ import tempfile import unittest from contextlib import contextmanager -from distutils.version import LooseVersion import pandas as pd import numpy as np diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/test_dataframe_conversion.py index 283849a06d58a..134cf8bd1c103 100644 --- a/python/pyspark/pandas/tests/test_dataframe_conversion.py +++ b/python/pyspark/pandas/tests/test_dataframe_conversion.py @@ -21,7 +21,6 @@ import tempfile import unittest import sys -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index 1667524910b9b..41be0eee4b864 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -18,12 +18,12 @@ import unittest import glob import os -from distutils.version import LooseVersion import numpy as np import pandas as pd from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index f39a3c4a0abc0..12f81a2e8588e 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -15,7 +15,6 @@ # limitations under the License. # -from distutils.version import LooseVersion from itertools import product import unittest diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 5a8b1e3792016..0b8fe26cb8381 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/test_rolling.py b/python/pyspark/pandas/tests/test_rolling.py index 526962e3bbdd2..c7e49eab5bb5e 100644 --- a/python/pyspark/pandas/tests/test_rolling.py +++ b/python/pyspark/pandas/tests/test_rolling.py @@ -15,7 +15,6 @@ # limitations under the License. # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_conversion.py b/python/pyspark/pandas/tests/test_series_conversion.py index b0a97b0a6f811..926c641ebc9c7 100644 --- a/python/pyspark/pandas/tests/test_series_conversion.py +++ b/python/pyspark/pandas/tests/test_series_conversion.py @@ -17,7 +17,6 @@ import unittest import sys -from distutils.version import LooseVersion import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py index c7ffc0675c6d8..89d4b70e0b51c 100644 --- a/python/pyspark/pandas/tests/test_series_datetime.py +++ b/python/pyspark/pandas/tests/test_series_datetime.py @@ -17,7 +17,6 @@ import datetime import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py index 93c6473f7d37c..b8d35764f1bc8 100644 --- a/python/pyspark/pandas/tests/test_series_string.py +++ b/python/pyspark/pandas/tests/test_series_string.py @@ -19,7 +19,6 @@ import numpy as np import re import unittest -from distutils.version import LooseVersion from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 1f5b6a732a566..40ee64a5f68d1 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import numpy as np import pandas as pd diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 817a92b8faa7e..db7f8e6dc75c9 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -19,6 +19,7 @@ "SparkConnectClient", ] +from pyspark.loose_version import LooseVersion from pyspark.sql.connect.utils import check_dependencies check_dependencies(__name__) @@ -31,7 +32,6 @@ import urllib.parse import uuid import sys -from distutils.version import LooseVersion from types import TracebackType from typing import ( Iterable, diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index e5d1d95a69967..53bf19b78c897 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -22,7 +22,6 @@ import os import warnings from collections.abc import Sized -from distutils.version import LooseVersion from functools import reduce from threading import RLock from typing import ( @@ -50,6 +49,7 @@ import urllib from pyspark import SparkContext, SparkConf, __version__ +from pyspark.loose_version import LooseVersion from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder from pyspark.sql.connect.conf import RuntimeConf from pyspark.sql.connect.dataframe import DataFrame diff --git a/python/pyspark/sql/connect/utils.py b/python/pyspark/sql/connect/utils.py index 8872ba50633cd..e96529e44f8a3 100644 --- a/python/pyspark/sql/connect/utils.py +++ b/python/pyspark/sql/connect/utils.py @@ -16,6 +16,7 @@ # import sys +from pyspark.loose_version import LooseVersion from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version @@ -39,8 +40,6 @@ def require_minimum_grpc_version() -> None: """Raise ImportError if minimum version of grpc is not installed""" minimum_grpc_version = "1.48.1" - from distutils.version import LooseVersion - try: import grpc except ImportError as error: diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index abbc9f9441f06..5288f0e100bb2 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -26,9 +26,9 @@ TYPE_CHECKING, ) from warnings import warn -from distutils.version import LooseVersion from pyspark.errors.exceptions.captured import unwrap_spark_exception +from pyspark.loose_version import LooseVersion from pyspark.rdd import _load_from_socket from pyspark.sql.pandas.serializers import ArrowCollectSerializer from pyspark.sql.pandas.types import _dedup_names diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 9aa2be96add63..4c1d1c177d638 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -20,6 +20,7 @@ """ from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError +from pyspark.loose_version import LooseVersion from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer from pyspark.sql.pandas.types import ( from_arrow_type, @@ -188,7 +189,6 @@ def arrow_to_pandas(self, arrow_column, struct_in_pandas="dict", ndarray_as_list pandas_options = {"date_as_object": True} import pyarrow as pa - from distutils.version import LooseVersion if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"): # A legacy option to coerce date32, date64, duration, and timestamp diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index c7504f901e629..3c72fdca07bbe 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -15,14 +15,13 @@ # limitations under the License. # +from pyspark.loose_version import LooseVersion def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. minimum_pandas_version = "1.4.4" - from distutils.version import LooseVersion - try: import pandas @@ -46,7 +45,6 @@ def require_minimum_pyarrow_version() -> None: # TODO(HyukjinKwon): Relocate and deduplicate the version specification. minimum_pyarrow_version = "4.0.0" - from distutils.version import LooseVersion import os try: From d84d3d3c8a854404f2981dcd382ee2d5347c5239 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 30 Sep 2023 20:24:14 -0700 Subject: [PATCH 2/3] Run dev/reformat-python and remove comments --- python/pyspark/loose_version.py | 25 ++++++++----------------- python/pyspark/sql/pandas/utils.py | 1 + 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py index 74d5285041f37..f2a4bcf8d25a9 100644 --- a/python/pyspark/loose_version.py +++ b/python/pyspark/loose_version.py @@ -1,21 +1,16 @@ import re -class LooseVersion: - component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE) +class LooseVersion: + component_re = re.compile(r"(\d+ | [a-z]+ | \.)", re.VERBOSE) - def __init__ (self, vstring=None): + def __init__(self, vstring=None): if vstring: self.parse(vstring) - - def parse (self, vstring): - # I've given up on thinking I can reconstruct the version string - # from the parsed tuple -- so I just store the string here for - # use by __str__ + def parse(self, vstring): self.vstring = vstring - components = [x for x in self.component_re.split(vstring) - if x and x != '.'] + components = [x for x in self.component_re.split(vstring) if x and x != "."] for i, obj in enumerate(components): try: components[i] = int(obj) @@ -24,15 +19,12 @@ def parse (self, vstring): self.version = components - - def __str__ (self): + def __str__(self): return self.vstring - - def __repr__ (self): + def __repr__(self): return "LooseVersion ('%s')" % str(self) - def __eq__(self, other): c = self._cmp(other) if c is NotImplemented: @@ -63,7 +55,7 @@ def __ge__(self, other): return c return c >= 0 - def _cmp (self, other): + def _cmp(self, other): if isinstance(other, str): other = LooseVersion(other) elif not isinstance(other, LooseVersion): @@ -75,4 +67,3 @@ def _cmp (self, other): return -1 if self.version > other.version: return 1 - diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index 3c72fdca07bbe..b62be20810288 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -17,6 +17,7 @@ from pyspark.loose_version import LooseVersion + def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. From 97f303a1fb489a093213b326deb356030e8fa3fa Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 1 Oct 2023 01:43:57 -0700 Subject: [PATCH 3/3] Address comments --- python/pyspark/__init__.py | 1 - python/pyspark/loose_version.py | 26 ++++++++++++++++---------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 21c4dfce4e3ff..a9a2a31702562 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -49,7 +49,6 @@ from functools import wraps import types from typing import cast, Any, Callable, Optional, TypeVar, Union -from warnings import filterwarnings from pyspark.conf import SparkConf from pyspark.rdd import RDD, RDDBarrier diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py index f2a4bcf8d25a9..11c27f4ead8f0 100644 --- a/python/pyspark/loose_version.py +++ b/python/pyspark/loose_version.py @@ -1,14 +1,20 @@ +# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# https://github.com/python/cpython/blob/3.11/LICENSE +# File originates from the cpython source +# https://github.com/python/cpython/blob/3.11/Lib/distutils/version.py + import re +from typing import Optional class LooseVersion: component_re = re.compile(r"(\d+ | [a-z]+ | \.)", re.VERBOSE) - def __init__(self, vstring=None): + def __init__(self, vstring: Optional[str]) -> None: if vstring: self.parse(vstring) - def parse(self, vstring): + def parse(self, vstring: str) -> None: self.vstring = vstring components = [x for x in self.component_re.split(vstring) if x and x != "."] for i, obj in enumerate(components): @@ -19,43 +25,43 @@ def parse(self, vstring): self.version = components - def __str__(self): + def __str__(self) -> str: return self.vstring - def __repr__(self): + def __repr__(self) -> str: return "LooseVersion ('%s')" % str(self) - def __eq__(self, other): + def __eq__(self, other): # type: ignore[no-untyped-def] c = self._cmp(other) if c is NotImplemented: return c return c == 0 - def __lt__(self, other): + def __lt__(self, other): # type: ignore[no-untyped-def] c = self._cmp(other) if c is NotImplemented: return c return c < 0 - def __le__(self, other): + def __le__(self, other): # type: ignore[no-untyped-def] c = self._cmp(other) if c is NotImplemented: return c return c <= 0 - def __gt__(self, other): + def __gt__(self, other): # type: ignore[no-untyped-def] c = self._cmp(other) if c is NotImplemented: return c return c > 0 - def __ge__(self, other): + def __ge__(self, other): # type: ignore[no-untyped-def] c = self._cmp(other) if c is NotImplemented: return c return c >= 0 - def _cmp(self, other): + def _cmp(self, other): # type: ignore[no-untyped-def] if isinstance(other, str): other = LooseVersion(other) elif not isinstance(other, LooseVersion):