From a288923d7600055b9af346ed74f88c7be598fbb1 Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Wed, 19 Aug 2015 12:00:46 -0700 Subject: [PATCH 1/8] [SPARK-1267][PYSPARK] Adds pip installer for pyspark --- python/pyspark/__init__.py | 27 +++++++++++++++++++++++++++ python/pyspark/pyspark_version.py | 17 +++++++++++++++++ python/setup.py | 19 +++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 python/pyspark/pyspark_version.py create mode 100644 python/setup.py diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 5f70ac6ed8fe6..4f0f898da649f 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -36,6 +36,33 @@ Finer-grained cache persistence levels. """ +import os +import sys + +import xml.etree.ElementTree as ET + +if (os.environ.get("SPARK_HOME", "not found") == "not found"): + raise ImportError("Environment variable SPARK_HOME is undefined.") + +spark_home = os.environ['SPARK_HOME'] +pom_xml_file_path = spark_home + '/pom.xml' + +try: + tree = ET.parse(pom_xml_file_path) + root = tree.getroot() + version_tag = root[4].text + snapshot_version = version_tag[:5] +except: + raise ImportError("Could not read the spark version, because pom.xml file" + + " is not found in SPARK_HOME(%s) directory." % (spark_home)) + +from pyspark.pyspark_version import __version__ +if (snapshot_version != __version__): + raise ImportError("Incompatible version of Spark(%s) and PySpark(%s)." % + (snapshot_version, __version__)) + +sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j-0.8.1-src.zip")) + from pyspark.conf import SparkConf from pyspark.context import SparkContext diff --git a/python/pyspark/pyspark_version.py b/python/pyspark/pyspark_version.py new file mode 100644 index 0000000000000..dd34f30853ac7 --- /dev/null +++ b/python/pyspark/pyspark_version.py @@ -0,0 +1,17 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +__version__ = '1.5.0' diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000000000..178b979502fb7 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +from setuptools import setup + +exec(compile(open("pyspark/pyspark_version.py").read(), + "pyspark/pyspark_version.py", 'exec')) +VERSION = __version__ + +setup(name = 'pyspark', + version = VERSION, + description = 'Apache Spark Python API', + author = 'Prabin Banka', + author_email = 'prabin.banka@imaginea.com', + url = 'https://github.com/apache/spark/tree/master/python', + packages = ['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], + data_files = [('pyspark', ['pyspark/pyspark_version.py'])], + install_requires = ['numpy>=1.7', 'py4j==0.8.2.1', 'pandas'], + license = 'http://www.apache.org/licenses/LICENSE-2.0', + ) From 5e02ee1e26642ab5ebe9e5ac4caf199bee737d3a Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Wed, 19 Aug 2015 12:59:26 -0700 Subject: [PATCH 2/8] fixing style --- python/setup.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/setup.py b/python/setup.py index 178b979502fb7..9daeb1bc9a90d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -6,14 +6,14 @@ "pyspark/pyspark_version.py", 'exec')) VERSION = __version__ -setup(name = 'pyspark', - version = VERSION, - description = 'Apache Spark Python API', - author = 'Prabin Banka', - author_email = 'prabin.banka@imaginea.com', - url = 'https://github.com/apache/spark/tree/master/python', - packages = ['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], - data_files = [('pyspark', ['pyspark/pyspark_version.py'])], - install_requires = ['numpy>=1.7', 'py4j==0.8.2.1', 'pandas'], - license = 'http://www.apache.org/licenses/LICENSE-2.0', - ) +setup(name='pyspark', + version=VERSION, + description='Apache Spark Python API', + author='Spark Developers', + author_email='dev@spark.apache.org', + url='https://github.com/apache/spark/tree/master/python', + packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], + data_files=[('pyspark', ['pyspark/pyspark_version.py'])], + install_requires=['numpy>=1.7', 'py4j==0.8.2.1', 'pandas'], + license='http://www.apache.org/licenses/LICENSE-2.0', + ) From c71fb5a9353f76aa30fea818848e8c2359bb09b2 Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Wed, 19 Aug 2015 13:12:31 -0700 Subject: [PATCH 3/8] fixing path munging --- python/pyspark/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 4f0f898da649f..5fa8bc7fe5e19 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -45,7 +45,7 @@ raise ImportError("Environment variable SPARK_HOME is undefined.") spark_home = os.environ['SPARK_HOME'] -pom_xml_file_path = spark_home + '/pom.xml' +pom_xml_file_path = os.path.join(spark_home, 'pom.xml') try: tree = ET.parse(pom_xml_file_path) @@ -61,8 +61,6 @@ raise ImportError("Incompatible version of Spark(%s) and PySpark(%s)." % (snapshot_version, __version__)) -sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j-0.8.1-src.zip")) - from pyspark.conf import SparkConf from pyspark.context import SparkContext From 7ad1a8d629eb63d4dc71d7b05d634a4951826aad Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Thu, 20 Aug 2015 11:01:40 -0700 Subject: [PATCH 4/8] remove data_files option --- python/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 9daeb1bc9a90d..702722d1fc637 100644 --- a/python/setup.py +++ b/python/setup.py @@ -13,7 +13,6 @@ author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], - data_files=[('pyspark', ['pyspark/pyspark_version.py'])], install_requires=['numpy>=1.7', 'py4j==0.8.2.1', 'pandas'], license='http://www.apache.org/licenses/LICENSE-2.0', ) From 55d760232deb94a0708fa66e7f6f1a4e05e10dbe Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Mon, 5 Oct 2015 16:50:56 -0700 Subject: [PATCH 5/8] env check fix --- python/pyspark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 5fa8bc7fe5e19..644f611e8d1f4 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -41,7 +41,7 @@ import xml.etree.ElementTree as ET -if (os.environ.get("SPARK_HOME", "not found") == "not found"): +if os.environ.get("SPARK_HOME") is None: raise ImportError("Environment variable SPARK_HOME is undefined.") spark_home = os.environ['SPARK_HOME'] From ff43762221b2a074da260c5ac4c9e5d2ef85c11e Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Tue, 6 Oct 2015 16:11:38 -0700 Subject: [PATCH 6/8] extras_require --- python/setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 702722d1fc637..699568a0322ed 100644 --- a/python/setup.py +++ b/python/setup.py @@ -13,6 +13,10 @@ author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], - install_requires=['numpy>=1.7', 'py4j==0.8.2.1', 'pandas'], + install_requires=['py4j==0.8.2.1'], + extras_require = { + 'ml': ['numpy>=1.7'], + 'sql': ['pandas'] + }, license='http://www.apache.org/licenses/LICENSE-2.0', ) From 3b864cebd7cc2bac92e3281fd88dafb25a786e9e Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Sat, 10 Oct 2015 14:50:23 -0700 Subject: [PATCH 7/8] check assembly jar for version --- python/pyspark/__init__.py | 42 +++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 644f611e8d1f4..f4201ba28c913 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -37,24 +37,46 @@ """ import os +import re import sys +from os.path import isfile, join + import xml.etree.ElementTree as ET if os.environ.get("SPARK_HOME") is None: raise ImportError("Environment variable SPARK_HOME is undefined.") spark_home = os.environ['SPARK_HOME'] -pom_xml_file_path = os.path.join(spark_home, 'pom.xml') - -try: - tree = ET.parse(pom_xml_file_path) - root = tree.getroot() - version_tag = root[4].text - snapshot_version = version_tag[:5] -except: - raise ImportError("Could not read the spark version, because pom.xml file" + - " is not found in SPARK_HOME(%s) directory." % (spark_home)) +pom_xml_file_path = join(spark_home, 'pom.xml') +snapshot_version = None + +if isfile(pom_xml_file_path): + try: + tree = ET.parse(pom_xml_file_path) + root = tree.getroot() + version_tag = root[4].text + snapshot_version = version_tag[:5] + except: + raise ImportError("Could not read the spark version, because pom.xml file" + + " could not be read.") +else: + try: + lib_file_path = join(spark_home, "lib") + jars = [f for f in os.listdir(lib_file_path) if isfile(join(lib_file_path, f))] + + for jar in jars: + m = re.match(r"^spark-assembly-([0-9\.]+).*\.jar$", jar) + if m is not None and len(m.groups()) > 0: + snapshot_version = m.group(1) + + if snapshot_version is None: + raise ImportError("Could not read the spark version, because pom.xml or spark" + + " assembly jar could not be found.") + except OSError: + raise ImportError("Could not read the spark version, because pom.xml or lib directory" + + " could not be found in SPARK_HOME") + from pyspark.pyspark_version import __version__ if (snapshot_version != __version__): From 794daf2d4dc6b3deccb8670cb95996440c6b20dd Mon Sep 17 00:00:00 2001 From: Auberon Lopez Date: Tue, 10 Nov 2015 10:35:56 -0800 Subject: [PATCH 8/8] upped py4j version --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 699568a0322ed..42aaf1b57e323 100644 --- a/python/setup.py +++ b/python/setup.py @@ -13,7 +13,7 @@ author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], - install_requires=['py4j==0.8.2.1'], + install_requires=['py4j==0.9'], extras_require = { 'ml': ['numpy>=1.7'], 'sql': ['pandas']