diff --git a/.dockerignore b/.dockerignore index a369d7d59a6..f59f6e9bb53 100644 --- a/.dockerignore +++ b/.dockerignore @@ -57,6 +57,4 @@ !rust/parquet/build.rs !rust/parquet_derive/Cargo.toml !rust/parquet_derive_test/Cargo.toml -!rust/datafusion/Cargo.toml -!rust/datafusion/benches !rust/integration-testing/Cargo.toml diff --git a/docs/.gitignore b/docs/.gitignore index d2e9f6ccc8f..e896e5c91bb 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -17,3 +17,6 @@ _build source/python/generated +source/datafusion/generated +venv/ +bigfile.arrow diff --git a/docs/environment.yml b/docs/environment.yml index 8d1fe9bfb5d..7aa22c4ac9b 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -16,10 +16,11 @@ # under the License. channels: -- defaults -- conda-forge + - defaults + - conda-forge dependencies: -- arrow-cpp -- parquet-cpp -- pyarrow -- numpydoc + - arrow-cpp + - parquet-cpp + - pyarrow + - datafusion + - numpydoc diff --git a/docs/source/conf.py b/docs/source/conf.py index 2f813c07268..1de01ebea86 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -37,14 +37,10 @@ import warnings from unittest import mock +import datafusion import pyarrow - -sys.path.extend([ - os.path.join(os.path.dirname(__file__), - '..', '../..') - -]) +sys.path.extend([os.path.join(os.path.dirname(__file__), "..", "../..")]) # Suppresses all warnings printed when sphinx is traversing the code (e.g. # deprecation warnings) @@ -60,24 +56,24 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.ifconfig', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'IPython.sphinxext.ipython_directive', - 'IPython.sphinxext.ipython_console_highlighting', - 'breathe' + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.ifconfig", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "IPython.sphinxext.ipython_directive", + "IPython.sphinxext.ipython_console_highlighting", + "breathe", ] # Show members for classes in .. autosummary autodoc_default_options = { - 'members': None, - 'undoc-members': None, - 'show-inheritance': None, - 'inherited-members': None + "members": None, + "undoc-members": None, + "show-inheritance": None, + "inherited-members": None, } # Breathe configuration @@ -88,19 +84,19 @@ autodoc_mock_imports = [] # ipython directive options -ipython_mplbackend = '' +ipython_mplbackend = "" # numpydoc configuration napoleon_use_rtype = False # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = ['.rst'] +source_suffix = [".rst"] autosummary_generate = True @@ -109,23 +105,21 @@ # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. 
-project = u'Apache Arrow' -copyright = f'2016-{datetime.datetime.now().year} Apache Software Foundation' -author = u'Apache Software Foundation' +project = "Apache Arrow" +copyright = f"2016-{datetime.datetime.now().year} Apache Software Foundation" +author = "Apache Software Foundation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = os.environ.get('ARROW_DOCS_VERSION', - pyarrow.__version__) +version = os.environ.get("ARROW_DOCS_VERSION", pyarrow.__version__) # The full version, including alpha/beta/rc tags. -release = os.environ.get('ARROW_DOCS_VERSION', - pyarrow.__version__) +release = os.environ.get("ARROW_DOCS_VERSION", pyarrow.__version__) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -146,7 +140,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -168,7 +162,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -185,7 +179,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'pydata_sphinx_theme' +html_theme = "pydata_sphinx_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -202,7 +196,7 @@ # The name for this set of Sphinx documents. 
# " v documentation" by default. # -html_title = u'Apache Arrow v{}'.format(version) +html_title = "Apache Arrow v{}".format(version) # A shorter title for the navigation bar. Default is the same as html_title. # @@ -222,10 +216,10 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom fixes to the RTD theme -html_css_files = ['theme_overrides.css'] +html_css_files = ["theme_overrides.css"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -247,8 +241,8 @@ # Custom sidebar templates, maps document names to template names. # html_sidebars = { -# '**': ['sidebar-logo.html', 'sidebar-search-bs.html', 'sidebar-nav-bs.html'], - '**': ['docs-sidebar.html'], + # '**': ['sidebar-logo.html', 'sidebar-search-bs.html', 'sidebar-nav-bs.html'], + "**": ["docs-sidebar.html"], } # Additional templates that should be rendered to pages, maps page names to @@ -308,34 +302,36 @@ # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'arrowdoc' +htmlhelp_basename = "arrowdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. 
+ # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'arrow.tex', u'Apache Arrow Documentation', - u'Apache Arrow Team', 'manual'), + ( + master_doc, + "arrow.tex", + "Apache Arrow Documentation", + "Apache Arrow Team", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of @@ -375,10 +371,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'arrow', u'Apache Arrow Documentation', - [author], 1) -] +man_pages = [(master_doc, "arrow", "Apache Arrow Documentation", [author], 1)] # If true, show URL addresses after external links. # @@ -391,9 +384,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'arrow', u'Apache Arrow Documentation', - author, 'Apache Arrow', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "arrow", + "Apache Arrow Documentation", + author, + "Apache Arrow", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. @@ -427,24 +426,26 @@ try: import pyarrow.cuda + cuda_enabled = True except ImportError: cuda_enabled = False # Mock pyarrow.cuda to avoid autodoc warnings. 
# XXX I can't get autodoc_mock_imports to work, so mock manually instead # (https://github.com/sphinx-doc/sphinx/issues/2174#issuecomment-453177550) - pyarrow.cuda = sys.modules['pyarrow.cuda'] = mock.Mock() + pyarrow.cuda = sys.modules["pyarrow.cuda"] = mock.Mock() try: import pyarrow.flight + flight_enabled = True except ImportError: flight_enabled = False - pyarrow.flight = sys.modules['pyarrow.flight'] = mock.Mock() + pyarrow.flight = sys.modules["pyarrow.flight"] = mock.Mock() def setup(app): # Use a config value to indicate whether CUDA API docs can be generated. # This will also rebuild appropriately when the value changes. - app.add_config_value('cuda_enabled', cuda_enabled, 'env') - app.add_config_value('flight_enabled', flight_enabled, 'env') + app.add_config_value("cuda_enabled", cuda_enabled, "env") + app.add_config_value("flight_enabled", flight_enabled, "env") diff --git a/docs/source/datafusion/api.rst b/docs/source/datafusion/api.rst new file mode 100644 index 00000000000..f81753e082e --- /dev/null +++ b/docs/source/datafusion/api.rst @@ -0,0 +1,30 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api: + +************* +API Reference +************* + +.. 
toctree:: + :maxdepth: 2 + + api/dataframe + api/execution_context + api/expression + api/functions diff --git a/docs/source/datafusion/api/dataframe.rst b/docs/source/datafusion/api/dataframe.rst new file mode 100644 index 00000000000..165ad068c42 --- /dev/null +++ b/docs/source/datafusion/api/dataframe.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.dataframe: +.. currentmodule:: datafusion + +DataFrame +========= + +.. autosummary:: + :toctree: ../generated/ + + datafusion.DataFrame diff --git a/docs/source/datafusion/api/execution_context.rst b/docs/source/datafusion/api/execution_context.rst new file mode 100644 index 00000000000..8c86ab77162 --- /dev/null +++ b/docs/source/datafusion/api/execution_context.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. 
http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.execution_context: +.. currentmodule:: datafusion + +ExecutionContext +================ + +.. autosummary:: + :toctree: ../generated/ + + datafusion.ExecutionContext diff --git a/docs/source/datafusion/api/expression.rst b/docs/source/datafusion/api/expression.rst new file mode 100644 index 00000000000..deb61a88780 --- /dev/null +++ b/docs/source/datafusion/api/expression.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.expression: +.. currentmodule:: datafusion + +Expression +========== + +.. autosummary:: + :toctree: ../generated/ + + datafusion.Expression diff --git a/docs/source/datafusion/api/functions.rst b/docs/source/datafusion/api/functions.rst new file mode 100644 index 00000000000..2ec3b99cc6f --- /dev/null +++ b/docs/source/datafusion/api/functions.rst @@ -0,0 +1,30 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.functions: +.. currentmodule:: datafusion.functions + +Functions +========= + +.. autosummary:: + :toctree: ../generated/ + + datafusion.functions.col + datafusion.functions.lit + datafusion.functions.udaf + datafusion.functions.udf diff --git a/docs/source/datafusion/index.rst b/docs/source/datafusion/index.rst new file mode 100644 index 00000000000..db1141f2e63 --- /dev/null +++ b/docs/source/datafusion/index.rst @@ -0,0 +1,189 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +==================== +DataFusion in Python +==================== + +This is a Python library that binds to the `Apache Arrow <https://arrow.apache.org/>`_ in-memory query engine `DataFusion <https://github.com/apache/arrow-datafusion>`_. + +Like PySpark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, Parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. + +It also allows you to use UDFs and UDAFs for complex operations. + +The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. + +Its query engine, DataFusion, is written in `Rust <https://www.rust-lang.org/>`_, which makes strong assumptions about thread safety and lack of memory leaks. + +Technically, zero-copy is achieved via the `C data interface <https://arrow.apache.org/docs/format/CDataInterface.html>`_. + +How to use it +============= + +Simple usage: + +.. code-block:: python + + import datafusion + import pyarrow + + # an alias + f = datafusion.functions + + # create a context + ctx = datafusion.ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) + + # create a new statement + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pyarrow.array([5, 7, 9]) + assert result.column(1) == pyarrow.array([-3, -3, -3]) + + +UDFs +---- + +.. code-block:: python + + def is_null(array: pyarrow.Array) -> pyarrow.Array: + return array.is_null() + + udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) + + df = df.select(udf(f.col("a"))) + + +UDAF +---- + +..
code-block:: python + + import pyarrow + import pyarrow.compute + + + class Accumulator: + """ + Interface of a user-defined accumulation. + """ + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + + df = ... + + udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) + + df = df.aggregate( + [], + [udaf(f.col("a"))] + ) + + +How to install (from pip) +========================= + +.. code-block:: shell + + pip install datafusion + + +How to develop +============== + +This assumes that you have Rust and Cargo installed. We use the workflow recommended by `pyo3 <https://github.com/PyO3/pyo3>`_ and `maturin <https://github.com/PyO3/maturin>`_. + +Bootstrap: + +.. code-block:: shell + + # fetch this repo + git clone git@github.com:apache/arrow-datafusion.git + + cd arrow-datafusion/python + + # prepare development environment (used to build wheel / install in development) + python3 -m venv venv + # activate the venv + source venv/bin/activate + pip install -r requirements.txt + + +Whenever Rust code changes (your changes or via ``git pull``): + +.. code-block:: shell + + # make sure you activate the venv using "source venv/bin/activate" first + maturin develop + python -m pytest + + +How to update dependencies +========================== + +To change test dependencies, change the ``requirements.in`` and run + +..
code-block:: shell + + # install pip-tools (this only needs to be done once), also consider running in venv + pip install pip-tools + + # change requirements.in and then run + pip-compile --generate-hashes + + +To update dependencies, run + +.. code-block:: shell + + pip-compile --upgrade --generate-hashes + + +More details `here <https://github.com/jazzband/pip-tools>`_ + + +.. toctree:: + :maxdepth: 2 + + api diff --git a/docs/source/index.rst b/docs/source/index.rst index 65aeb47ea9f..9bae323a14b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,7 @@ target environment.** C/GLib C++ C# + DataFusion <datafusion/index> Go Java JavaScript @@ -67,7 +68,7 @@ target environment.** format/Integration format/CDataInterface format/CStreamInterface - format/Other + format/Other .. _toc.development: