From adfd9268fe4f7f859085efa6bf70a7dab2569f7b Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dhyun@apple.com>
Date: Fri, 29 Sep 2023 15:01:59 -0700
Subject: [PATCH 1/3] [SPARK-45390][PYTHON] Remove `distutils` usage

---
 LICENSE                                       |  1 +
 LICENSE-binary                                |  6 ++
 dev/.rat-excludes                             |  1 +
 python/pyspark/__init__.py                    |  4 -
 python/pyspark/loose_version.py               | 78 +++++++++++++++++++
 python/pyspark/pandas/plot/matplotlib.py      |  2 +-
 python/pyspark/pandas/supported_api_gen.py    |  2 +-
 .../pandas/tests/computation/test_any_all.py  |  1 -
 .../pandas/tests/computation/test_corrwith.py |  1 -
 .../pandas/tests/computation/test_cov.py      |  1 -
 .../tests/data_type_ops/test_date_ops.py      |  1 -
 .../diff_frames_ops/test_cov_corrwith.py      |  2 +-
 .../pandas/tests/groupby/test_aggregate.py    |  1 -
 .../pandas/tests/groupby/test_apply_func.py   |  1 -
 .../pandas/tests/groupby/test_head_tail.py    |  1 -
 .../pandas/tests/groupby/test_index.py        |  1 -
 .../pandas/tests/groupby/test_split_apply.py  |  1 -
 .../pyspark/pandas/tests/groupby/test_stat.py |  1 -
 .../pyspark/pandas/tests/indexes/test_base.py |  2 +-
 .../pandas/tests/indexes/test_category.py     |  2 +-
 .../pandas/tests/indexes/test_datetime.py     |  2 -
 .../pandas/tests/indexes/test_indexing.py     |  1 -
 .../pandas/tests/indexes/test_reindex.py      |  1 -
 .../pandas/tests/indexes/test_timedelta.py    |  1 -
 .../pandas/tests/series/test_compute.py       |  1 -
 python/pyspark/pandas/tests/test_csv.py       |  1 -
 .../pandas/tests/test_dataframe_conversion.py |  1 -
 .../pandas/tests/test_dataframe_spark_io.py   |  2 +-
 .../pandas/tests/test_ops_on_diff_frames.py   |  1 -
 .../tests/test_ops_on_diff_frames_groupby.py  |  1 -
 python/pyspark/pandas/tests/test_rolling.py   |  1 -
 .../pandas/tests/test_series_conversion.py    |  1 -
 .../pandas/tests/test_series_datetime.py      |  1 -
 .../pandas/tests/test_series_string.py        |  1 -
 python/pyspark/pandas/tests/test_stats.py     |  1 -
 python/pyspark/sql/connect/client/core.py     |  2 +-
 python/pyspark/sql/connect/session.py         |  2 +-
 python/pyspark/sql/connect/utils.py           |  3 +-
 python/pyspark/sql/pandas/conversion.py       |  2 +-
 python/pyspark/sql/pandas/serializers.py      |  2 +-
 python/pyspark/sql/pandas/utils.py            |  4 +-
 41 files changed, 98 insertions(+), 44 deletions(-)
 create mode 100644 python/pyspark/loose_version.py

diff --git a/LICENSE b/LICENSE
index 3fee963db7499..44983fd1259e5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -221,6 +221,7 @@ connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaR
 Python Software Foundation License
 ----------------------------------
 
+python/pyspark/loose_version.py
 python/docs/source/_static/copybutton.js
 
 BSD 3-Clause
diff --git a/LICENSE-binary b/LICENSE-binary
index f0f59e7d57644..30fca96a8832d 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -421,6 +421,12 @@ This section summarizes those components and their licenses. See licenses-binary
 for text of these licenses.
 
 
+Python Software Foundation License
+----------------------------------
+
+python/pyspark/loose_version.py
+
+
 BSD 2-Clause
 ------------
 
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
index d8fc418c3146b..d755c0ff749c4 100644
--- a/dev/.rat-excludes
+++ b/dev/.rat-excludes
@@ -142,3 +142,4 @@ empty.proto
 LimitedInputStream.java
 TimSort.java
 xml-resources/*
+loose_version.py
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index b8bca7776dd5c..21c4dfce4e3ff 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -51,10 +51,6 @@
 from typing import cast, Any, Callable, Optional, TypeVar, Union
 from warnings import filterwarnings
 
-filterwarnings(
-    "ignore", message="distutils Version classes are deprecated. Use packaging.version instead."
-)
-
 from pyspark.conf import SparkConf
 from pyspark.rdd import RDD, RDDBarrier
 from pyspark.files import SparkFiles
diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py
new file mode 100644
index 0000000000000..74d5285041f37
--- /dev/null
+++ b/python/pyspark/loose_version.py
@@ -0,0 +1,78 @@
+import re
+
+class LooseVersion:
+
+    component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE)
+
+    def __init__ (self, vstring=None):
+        if vstring:
+            self.parse(vstring)
+
+
+    def parse (self, vstring):
+        # I've given up on thinking I can reconstruct the version string
+        # from the parsed tuple -- so I just store the string here for
+        # use by __str__
+        self.vstring = vstring
+        components = [x for x in self.component_re.split(vstring)
+                              if x and x != '.']
+        for i, obj in enumerate(components):
+            try:
+                components[i] = int(obj)
+            except ValueError:
+                pass
+
+        self.version = components
+
+
+    def __str__ (self):
+        return self.vstring
+
+
+    def __repr__ (self):
+        return "LooseVersion ('%s')" % str(self)
+
+
+    def __eq__(self, other):
+        c = self._cmp(other)
+        if c is NotImplemented:
+            return c
+        return c == 0
+
+    def __lt__(self, other):
+        c = self._cmp(other)
+        if c is NotImplemented:
+            return c
+        return c < 0
+
+    def __le__(self, other):
+        c = self._cmp(other)
+        if c is NotImplemented:
+            return c
+        return c <= 0
+
+    def __gt__(self, other):
+        c = self._cmp(other)
+        if c is NotImplemented:
+            return c
+        return c > 0
+
+    def __ge__(self, other):
+        c = self._cmp(other)
+        if c is NotImplemented:
+            return c
+        return c >= 0
+
+    def _cmp (self, other):
+        if isinstance(other, str):
+            other = LooseVersion(other)
+        elif not isinstance(other, LooseVersion):
+            return NotImplemented
+
+        if self.version == other.version:
+            return 0
+        if self.version < other.version:
+            return -1
+        if self.version > other.version:
+            return 1
+
diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py
index 0164ec9f980e4..42f30ebf7ae08 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 
-from distutils.version import LooseVersion
+from pyspark.loose_version import LooseVersion
 
 import matplotlib as mat
 import numpy as np
diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py
index f00757fe36678..a43ad19801181 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -19,7 +19,7 @@
 Generate 'Supported pandas APIs' documentation file
 """
 import warnings
-from distutils.version import LooseVersion
+from pyspark.loose_version import LooseVersion
 from enum import Enum, unique
 from inspect import getmembers, isclass, isfunction, signature
 from typing import Any, Callable, Dict, List, NamedTuple, Set, TextIO, Tuple
diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py
index 64f293c48d64a..6c120aead4e65 100644
--- a/python/pyspark/pandas/tests/computation/test_any_all.py
+++ b/python/pyspark/pandas/tests/computation/test_any_all.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import numpy as np
diff --git a/python/pyspark/pandas/tests/computation/test_corrwith.py b/python/pyspark/pandas/tests/computation/test_corrwith.py
index 4db61c1585430..b64bf2d411b25 100644
--- a/python/pyspark/pandas/tests/computation/test_corrwith.py
+++ b/python/pyspark/pandas/tests/computation/test_corrwith.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 
diff --git a/python/pyspark/pandas/tests/computation/test_cov.py b/python/pyspark/pandas/tests/computation/test_cov.py
index 979f0b73839c3..23e5ec587e9a9 100644
--- a/python/pyspark/pandas/tests/computation/test_cov.py
+++ b/python/pyspark/pandas/tests/computation/test_cov.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 import decimal
 
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py
index d4e44afba911d..33332503943d3 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py
@@ -17,7 +17,6 @@
 
 import datetime
 import unittest
-from distutils.version import LooseVersion
 
 import pandas as pd
 from pandas.api.types import CategoricalDtype
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py
index d3b36dab4514d..1de0a61ab4b5e 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_cov_corrwith.py
@@ -14,12 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 
 import pandas as pd
 import numpy as np
 
 from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.pandas.config import set_option, reset_option
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
diff --git a/python/pyspark/pandas/tests/groupby/test_aggregate.py b/python/pyspark/pandas/tests/groupby/test_aggregate.py
index c2d7872c8434f..9e7f3f6cd113d 100644
--- a/python/pyspark/pandas/tests/groupby/test_aggregate.py
+++ b/python/pyspark/pandas/tests/groupby/test_aggregate.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import unittest
-from distutils.version import LooseVersion
 
 import pandas as pd
 
diff --git a/python/pyspark/pandas/tests/groupby/test_apply_func.py b/python/pyspark/pandas/tests/groupby/test_apply_func.py
index da6337b0ca958..e7a30ff57b414 100644
--- a/python/pyspark/pandas/tests/groupby/test_apply_func.py
+++ b/python/pyspark/pandas/tests/groupby/test_apply_func.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import numpy as np
diff --git a/python/pyspark/pandas/tests/groupby/test_head_tail.py b/python/pyspark/pandas/tests/groupby/test_head_tail.py
index 1a22db74f26f4..6f69b0bdee003 100644
--- a/python/pyspark/pandas/tests/groupby/test_head_tail.py
+++ b/python/pyspark/pandas/tests/groupby/test_head_tail.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import numpy as np
diff --git a/python/pyspark/pandas/tests/groupby/test_index.py b/python/pyspark/pandas/tests/groupby/test_index.py
index 9c73e59eabf04..9219a65d2cb46 100644
--- a/python/pyspark/pandas/tests/groupby/test_index.py
+++ b/python/pyspark/pandas/tests/groupby/test_index.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py
index a3ef8c73de44a..e8648c9b0a84f 100644
--- a/python/pyspark/pandas/tests/groupby/test_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py
index 695d079db49da..29991ae1d54c4 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from distutils.version import LooseVersion
 import unittest
 
 import numpy as np
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index ccdb575ff702d..e84ab60f1216f 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -17,13 +17,13 @@
 
 import inspect
 import unittest
-from distutils.version import LooseVersion
 from datetime import datetime, timedelta
 
 import numpy as np
 import pandas as pd
 
 import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.pandas.exceptions import PandasNotImplementedError
 from pyspark.pandas.missing.indexes import (
     MissingPandasLikeDatetimeIndex,
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index 5b6bd2cad078e..0cd89711beedb 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -16,12 +16,12 @@
 #
 
 import unittest
-from distutils.version import LooseVersion
 
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 
 import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py
index 4eaefb514d917..d89e448dd4f9d 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -17,8 +17,6 @@
 
 import datetime
 
-from distutils.version import LooseVersion
-
 import numpy as np
 import pandas as pd
 
diff --git a/python/pyspark/pandas/tests/indexes/test_indexing.py b/python/pyspark/pandas/tests/indexes/test_indexing.py
index 111dd09696d79..3facb1929b4fe 100644
--- a/python/pyspark/pandas/tests/indexes/test_indexing.py
+++ b/python/pyspark/pandas/tests/indexes/test_indexing.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import unittest
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/indexes/test_reindex.py b/python/pyspark/pandas/tests/indexes/test_reindex.py
index 26eb97fdb552e..1d544ea221bf9 100644
--- a/python/pyspark/pandas/tests/indexes/test_reindex.py
+++ b/python/pyspark/pandas/tests/indexes/test_reindex.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import unittest
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py
index 5321f96eeaba7..6bab794f3abbb 100644
--- a/python/pyspark/pandas/tests/indexes/test_timedelta.py
+++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -17,7 +17,6 @@
 
 import unittest
 from datetime import timedelta
-from distutils.version import LooseVersion
 
 import pandas as pd
 
diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py
index 3f6a24708db52..9e48de893a130 100644
--- a/python/pyspark/pandas/tests/series/test_compute.py
+++ b/python/pyspark/pandas/tests/series/test_compute.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import unittest
-from distutils.version import LooseVersion
 from itertools import product
 
 import numpy as np
diff --git a/python/pyspark/pandas/tests/test_csv.py b/python/pyspark/pandas/tests/test_csv.py
index a367dd72be1e5..a62388050472c 100644
--- a/python/pyspark/pandas/tests/test_csv.py
+++ b/python/pyspark/pandas/tests/test_csv.py
@@ -20,7 +20,6 @@
 import tempfile
 import unittest
 from contextlib import contextmanager
-from distutils.version import LooseVersion
 
 import pandas as pd
 import numpy as np
diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/test_dataframe_conversion.py
index 283849a06d58a..134cf8bd1c103 100644
--- a/python/pyspark/pandas/tests/test_dataframe_conversion.py
+++ b/python/pyspark/pandas/tests/test_dataframe_conversion.py
@@ -21,7 +21,6 @@
 import tempfile
 import unittest
 import sys
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
index 1667524910b9b..41be0eee4b864 100644
--- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py
+++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
@@ -18,12 +18,12 @@
 import unittest
 import glob
 import os
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
 
 from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index f39a3c4a0abc0..12f81a2e8588e 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-from distutils.version import LooseVersion
 from itertools import product
 import unittest
 
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
index 5a8b1e3792016..0b8fe26cb8381 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -16,7 +16,6 @@
 #
 
 import unittest
-from distutils.version import LooseVersion
 
 import pandas as pd
 
diff --git a/python/pyspark/pandas/tests/test_rolling.py b/python/pyspark/pandas/tests/test_rolling.py
index 526962e3bbdd2..c7e49eab5bb5e 100644
--- a/python/pyspark/pandas/tests/test_rolling.py
+++ b/python/pyspark/pandas/tests/test_rolling.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import unittest
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/test_series_conversion.py b/python/pyspark/pandas/tests/test_series_conversion.py
index b0a97b0a6f811..926c641ebc9c7 100644
--- a/python/pyspark/pandas/tests/test_series_conversion.py
+++ b/python/pyspark/pandas/tests/test_series_conversion.py
@@ -17,7 +17,6 @@
 
 import unittest
 import sys
-from distutils.version import LooseVersion
 
 import pandas as pd
 
diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py
index c7ffc0675c6d8..89d4b70e0b51c 100644
--- a/python/pyspark/pandas/tests/test_series_datetime.py
+++ b/python/pyspark/pandas/tests/test_series_datetime.py
@@ -17,7 +17,6 @@
 
 import datetime
 import unittest
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py
index 93c6473f7d37c..b8d35764f1bc8 100644
--- a/python/pyspark/pandas/tests/test_series_string.py
+++ b/python/pyspark/pandas/tests/test_series_string.py
@@ -19,7 +19,6 @@
 import numpy as np
 import re
 import unittest
-from distutils.version import LooseVersion
 
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py
index 1f5b6a732a566..40ee64a5f68d1 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -16,7 +16,6 @@
 #
 
 import unittest
-from distutils.version import LooseVersion
 import numpy as np
 import pandas as pd
 
diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py
index 817a92b8faa7e..db7f8e6dc75c9 100644
--- a/python/pyspark/sql/connect/client/core.py
+++ b/python/pyspark/sql/connect/client/core.py
@@ -19,6 +19,7 @@
     "SparkConnectClient",
 ]
 
+from pyspark.loose_version import LooseVersion
 from pyspark.sql.connect.utils import check_dependencies
 
 check_dependencies(__name__)
@@ -31,7 +32,6 @@
 import urllib.parse
 import uuid
 import sys
-from distutils.version import LooseVersion
 from types import TracebackType
 from typing import (
     Iterable,
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index e5d1d95a69967..53bf19b78c897 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -22,7 +22,6 @@
 import os
 import warnings
 from collections.abc import Sized
-from distutils.version import LooseVersion
 from functools import reduce
 from threading import RLock
 from typing import (
@@ -50,6 +49,7 @@
 import urllib
 
 from pyspark import SparkContext, SparkConf, __version__
+from pyspark.loose_version import LooseVersion
 from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder
 from pyspark.sql.connect.conf import RuntimeConf
 from pyspark.sql.connect.dataframe import DataFrame
diff --git a/python/pyspark/sql/connect/utils.py b/python/pyspark/sql/connect/utils.py
index 8872ba50633cd..e96529e44f8a3 100644
--- a/python/pyspark/sql/connect/utils.py
+++ b/python/pyspark/sql/connect/utils.py
@@ -16,6 +16,7 @@
 #
 import sys
 
+from pyspark.loose_version import LooseVersion
 from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
 
 
@@ -39,8 +40,6 @@ def require_minimum_grpc_version() -> None:
     """Raise ImportError if minimum version of grpc is not installed"""
     minimum_grpc_version = "1.48.1"
 
-    from distutils.version import LooseVersion
-
     try:
         import grpc
     except ImportError as error:
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index abbc9f9441f06..5288f0e100bb2 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -26,9 +26,9 @@
     TYPE_CHECKING,
 )
 from warnings import warn
-from distutils.version import LooseVersion
 
 from pyspark.errors.exceptions.captured import unwrap_spark_exception
+from pyspark.loose_version import LooseVersion
 from pyspark.rdd import _load_from_socket
 from pyspark.sql.pandas.serializers import ArrowCollectSerializer
 from pyspark.sql.pandas.types import _dedup_names
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 9aa2be96add63..4c1d1c177d638 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -20,6 +20,7 @@
 """
 
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
+from pyspark.loose_version import LooseVersion
 from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer
 from pyspark.sql.pandas.types import (
     from_arrow_type,
@@ -188,7 +189,6 @@ def arrow_to_pandas(self, arrow_column, struct_in_pandas="dict", ndarray_as_list
         pandas_options = {"date_as_object": True}
 
         import pyarrow as pa
-        from distutils.version import LooseVersion
 
         if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
             # A legacy option to coerce date32, date64, duration, and timestamp
diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py
index c7504f901e629..3c72fdca07bbe 100644
--- a/python/pyspark/sql/pandas/utils.py
+++ b/python/pyspark/sql/pandas/utils.py
@@ -15,14 +15,13 @@
 # limitations under the License.
 #
 
+from pyspark.loose_version import LooseVersion
 
 def require_minimum_pandas_version() -> None:
     """Raise ImportError if minimum version of Pandas is not installed"""
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.
     minimum_pandas_version = "1.4.4"
 
-    from distutils.version import LooseVersion
-
     try:
         import pandas
 
@@ -46,7 +45,6 @@ def require_minimum_pyarrow_version() -> None:
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.
     minimum_pyarrow_version = "4.0.0"
 
-    from distutils.version import LooseVersion
     import os
 
     try:

From d84d3d3c8a854404f2981dcd382ee2d5347c5239 Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dhyun@apple.com>
Date: Sat, 30 Sep 2023 20:24:14 -0700
Subject: [PATCH 2/3] Run dev/reformat-python and remove comments

---
 python/pyspark/loose_version.py    | 25 ++++++++-----------------
 python/pyspark/sql/pandas/utils.py |  1 +
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py
index 74d5285041f37..f2a4bcf8d25a9 100644
--- a/python/pyspark/loose_version.py
+++ b/python/pyspark/loose_version.py
@@ -1,21 +1,16 @@
 import re
 
-class LooseVersion:
 
-    component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE)
+class LooseVersion:
+    component_re = re.compile(r"(\d+ | [a-z]+ | \.)", re.VERBOSE)
 
-    def __init__ (self, vstring=None):
+    def __init__(self, vstring=None):
         if vstring:
             self.parse(vstring)
 
-
-    def parse (self, vstring):
-        # I've given up on thinking I can reconstruct the version string
-        # from the parsed tuple -- so I just store the string here for
-        # use by __str__
+    def parse(self, vstring):
         self.vstring = vstring
-        components = [x for x in self.component_re.split(vstring)
-                              if x and x != '.']
+        components = [x for x in self.component_re.split(vstring) if x and x != "."]
         for i, obj in enumerate(components):
             try:
                 components[i] = int(obj)
@@ -24,15 +19,12 @@ def parse (self, vstring):
 
         self.version = components
 
-
-    def __str__ (self):
+    def __str__(self):
         return self.vstring
 
-
-    def __repr__ (self):
+    def __repr__(self):
         return "LooseVersion ('%s')" % str(self)
 
-
     def __eq__(self, other):
         c = self._cmp(other)
         if c is NotImplemented:
@@ -63,7 +55,7 @@ def __ge__(self, other):
             return c
         return c >= 0
 
-    def _cmp (self, other):
+    def _cmp(self, other):
         if isinstance(other, str):
             other = LooseVersion(other)
         elif not isinstance(other, LooseVersion):
@@ -75,4 +67,3 @@ def _cmp (self, other):
             return -1
         if self.version > other.version:
             return 1
-
diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py
index 3c72fdca07bbe..b62be20810288 100644
--- a/python/pyspark/sql/pandas/utils.py
+++ b/python/pyspark/sql/pandas/utils.py
@@ -17,6 +17,7 @@
 
 from pyspark.loose_version import LooseVersion
 
+
 def require_minimum_pandas_version() -> None:
     """Raise ImportError if minimum version of Pandas is not installed"""
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.

From 97f303a1fb489a093213b326deb356030e8fa3fa Mon Sep 17 00:00:00 2001
From: Dongjoon Hyun <dhyun@apple.com>
Date: Sun, 1 Oct 2023 01:43:57 -0700
Subject: [PATCH 3/3] Address comments

---
 python/pyspark/__init__.py      |  1 -
 python/pyspark/loose_version.py | 26 ++++++++++++++++----------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index 21c4dfce4e3ff..a9a2a31702562 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -49,7 +49,6 @@
 from functools import wraps
 import types
 from typing import cast, Any, Callable, Optional, TypeVar, Union
-from warnings import filterwarnings
 
 from pyspark.conf import SparkConf
 from pyspark.rdd import RDD, RDDBarrier
diff --git a/python/pyspark/loose_version.py b/python/pyspark/loose_version.py
index f2a4bcf8d25a9..11c27f4ead8f0 100644
--- a/python/pyspark/loose_version.py
+++ b/python/pyspark/loose_version.py
@@ -1,14 +1,20 @@
+# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+# https://github.com/python/cpython/blob/3.11/LICENSE
+# File originates from the cpython source
+# https://github.com/python/cpython/blob/3.11/Lib/distutils/version.py
+
 import re
+from typing import Optional
 
 
 class LooseVersion:
     component_re = re.compile(r"(\d+ | [a-z]+ | \.)", re.VERBOSE)
 
-    def __init__(self, vstring=None):
+    def __init__(self, vstring: Optional[str] = None) -> None:  # distutils-compatible default
         if vstring:
             self.parse(vstring)
 
-    def parse(self, vstring):
+    def parse(self, vstring: str) -> None:
         self.vstring = vstring
         components = [x for x in self.component_re.split(vstring) if x and x != "."]
         for i, obj in enumerate(components):
@@ -19,43 +25,43 @@ def parse(self, vstring):
 
         self.version = components
 
-    def __str__(self):
+    def __str__(self) -> str:
         return self.vstring
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return "LooseVersion ('%s')" % str(self)
 
-    def __eq__(self, other):
+    def __eq__(self, other):  # type: ignore[no-untyped-def]
         c = self._cmp(other)
         if c is NotImplemented:
             return c
         return c == 0
 
-    def __lt__(self, other):
+    def __lt__(self, other):  # type: ignore[no-untyped-def]
         c = self._cmp(other)
         if c is NotImplemented:
             return c
         return c < 0
 
-    def __le__(self, other):
+    def __le__(self, other):  # type: ignore[no-untyped-def]
         c = self._cmp(other)
         if c is NotImplemented:
             return c
         return c <= 0
 
-    def __gt__(self, other):
+    def __gt__(self, other):  # type: ignore[no-untyped-def]
         c = self._cmp(other)
         if c is NotImplemented:
             return c
         return c > 0
 
-    def __ge__(self, other):
+    def __ge__(self, other):  # type: ignore[no-untyped-def]
         c = self._cmp(other)
         if c is NotImplemented:
             return c
         return c >= 0
 
-    def _cmp(self, other):
+    def _cmp(self, other):  # type: ignore[no-untyped-def]
         if isinstance(other, str):
             other = LooseVersion(other)
         elif not isinstance(other, LooseVersion):