diff --git a/python/doc/source/parquet.rst b/python/doc/source/parquet.rst
index d466ba128cd..b394f562aba 100644
--- a/python/doc/source/parquet.rst
+++ b/python/doc/source/parquet.rst
@@ -237,3 +237,44 @@ throughput:
 
    pq.read_table(where, nthreads=4)
    pq.ParquetDataset(where).read(nthreads=4)
+
+Reading a Parquet File from Azure Blob storage
+----------------------------------------------
+
+The code below shows how to use Azure's storage SDK along with pyarrow to read
+a parquet file into a Pandas dataframe.
+This is suitable for executing inside a Jupyter notebook running on a Python 3
+kernel.
+
+Dependencies:
+
+* python 3.6.2
+* azure-storage 0.36.0
+* pyarrow 0.8.0
+
+.. code-block:: python
+
+   import pyarrow.parquet as pq
+   from io import BytesIO
+   from azure.storage.blob import BlockBlobService
+
+   account_name = '...'
+   account_key = '...'
+   container_name = '...'
+   parquet_file = 'mysample.parquet'
+
+   byte_stream = BytesIO()
+   block_blob_service = BlockBlobService(account_name=account_name, account_key=account_key)
+   try:
+       block_blob_service.get_blob_to_stream(container_name=container_name, blob_name=parquet_file, stream=byte_stream)
+       df = pq.read_table(source=byte_stream).to_pandas()
+       # Do work on df ...
+   finally:
+       # Add finally block to ensure closure of the stream
+       byte_stream.close()
+
+Notes:
+
+* The ``account_key`` can be found under ``Settings -> Access keys`` in the Microsoft Azure portal for a given storage account
+* The code above works for a container with private access, Lease State = Available, Lease Status = Unlocked
+* The parquet file was Blob Type = Block blob