From a15910a482bbef7f12af3f9104472198729b066a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 15 May 2017 16:38:14 -0400 Subject: [PATCH 1/3] Rename InMemoryOutputStream to BufferOutputStream Change-Id: I6f3acbf17b68c13f849f15c193700df977377237 --- python/doc/source/api.rst | 2 +- python/doc/source/ipc.rst | 4 ++-- python/doc/source/memory.rst | 12 ++++++------ python/pyarrow/__init__.py | 8 +++++++- python/pyarrow/io.pxi | 2 +- python/pyarrow/tests/test_io.py | 6 +++--- python/pyarrow/tests/test_ipc.py | 4 ++-- python/pyarrow/tests/test_parquet.py | 2 +- 8 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index e7bea7013b9..c1459358b80 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -162,7 +162,7 @@ Input / Output and Shared Memory Buffer BufferReader - InMemoryOutputStream + BufferOutputStream NativeFile MemoryMappedFile memory_map diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst index c7de9c46638..f0844cd2a9c 100644 --- a/python/doc/source/ipc.rst +++ b/python/doc/source/ipc.rst @@ -60,7 +60,7 @@ this we use :class:`~pyarrow.RecordBatchStreamWriter`, which can write to a writ .. ipython:: python - sink = pa.InMemoryOutputStream() + sink = pa.BufferOutputStream() writer = pa.RecordBatchStreamWriter(sink, batch.schema) Here we used an in-memory Arrow buffer stream, but this could have been a @@ -109,7 +109,7 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as .. 
ipython:: python - sink = pa.InMemoryOutputStream() + sink = pa.BufferOutputStream() writer = pa.RecordBatchFileWriter(sink, batch.schema) for i in range(10): diff --git a/python/doc/source/memory.rst b/python/doc/source/memory.rst index d1020da2464..ccc6298b661 100644 --- a/python/doc/source/memory.rst +++ b/python/doc/source/memory.rst @@ -104,8 +104,8 @@ There are several kinds of :class:`~pyarrow.NativeFile` options available: memory maps * :class:`~pyarrow.BufferReader`, for reading :class:`~pyarrow.Buffer` objects as a file -* :class:`~pyarrow.InMemoryOutputStream`, for writing data in-memory, producing - a Buffer at the end +* :class:`~pyarrow.BufferOutputStream`, for writing data in-memory, producing a + Buffer at the end * :class:`~pyarrow.HdfsFile`, for reading and writing data to the Hadoop Filesystem * :class:`~pyarrow.PythonFile`, for interfacing with Python file objects in C++ @@ -124,11 +124,11 @@ then precisely track amount of memory that has been allocated: PyArrow uses a default built-in memory pool, but in the future there may be additional memory pools (and subpools) to choose from. Let's consider an -``InMemoryOutputStream``, which is like a ``BytesIO``: +``BufferOutputStream``, which is like a ``BytesIO``: .. ipython:: python - stream = pa.InMemoryOutputStream() + stream = pa.BufferOutputStream() stream.write(b'foo') pa.total_allocated_bytes() for i in range(1024): stream.write(b'foo') @@ -150,7 +150,7 @@ pass in a custom memory pool: my_pool = pa.jemalloc_memory_pool() my_pool my_pool.bytes_allocated() - stream = pa.InMemoryOutputStream(my_pool) + stream = pa.BufferOutputStream(my_pool) stream.write(b'foo') my_pool.bytes_allocated() @@ -215,7 +215,7 @@ file interfaces that can read and write to Arrow Buffers. .. 
ipython:: python - writer = pa.InMemoryOutputStream() + writer = pa.BufferOutputStream() writer.write(b'hello, friends') buf = writer.get_result() diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index d6d2aa4a671..22f0c7c634c 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -70,7 +70,7 @@ Date32Value, Date64Value, TimestampValue) from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, - Buffer, BufferReader, InMemoryOutputStream, + Buffer, BufferReader, BufferOutputStream, OSFile, MemoryMappedFile, memory_map, frombuffer, read_tensor, write_tensor, memory_map, create_memory_map, @@ -108,3 +108,9 @@ def jemalloc_memory_pool(): localfs = LocalFilesystem.get_instance() + + +# ---------------------------------------------------------------------- +# 0.4.0 deprecations + +InMemoryOutputStream = BufferOutputStream diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 4cbf603c2a5..7fd3b31339f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -515,7 +515,7 @@ cdef shared_ptr[PoolBuffer] allocate_buffer(CMemoryPool* pool): return result -cdef class InMemoryOutputStream(NativeFile): +cdef class BufferOutputStream(NativeFile): cdef: shared_ptr[PoolBuffer] buffer diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 610dedc6a76..f91dc6fa05d 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -179,7 +179,7 @@ def test_memory_output_stream(): # 10 bytes val = b'dataabcdef' - f = pa.InMemoryOutputStream() + f = pa.BufferOutputStream() K = 1000 for i in range(K): @@ -192,7 +192,7 @@ def test_memory_output_stream(): def test_inmemory_write_after_closed(): - f = pa.InMemoryOutputStream() + f = pa.BufferOutputStream() f.write(b'ok') f.get_result() @@ -212,7 +212,7 @@ def make_buffer(bytes_obj): def test_nativefile_write_memoryview(): - f = pa.InMemoryOutputStream() + f = pa.BufferOutputStream() data = b'ok' arr = np.frombuffer(data, 
dtype='S1') diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 4d19804dac2..994876de3c1 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -209,7 +209,7 @@ def test_read_all(self): class TestInMemoryFile(TestFile): def _get_sink(self): - return pa.InMemoryOutputStream() + return pa.BufferOutputStream() def _get_source(self): return self.sink.get_result() @@ -219,7 +219,7 @@ def test_ipc_zero_copy_numpy(): df = pd.DataFrame({'foo': [1.5]}) batch = pa.RecordBatch.from_pandas(df) - sink = pa.InMemoryOutputStream() + sink = pa.BufferOutputStream() write_file(batch, sink) buffer = sink.get_result() reader = pa.BufferReader(buffer) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 7144de2a685..5dbe6574756 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -179,7 +179,7 @@ def _test_dataframe(size=10000, seed=0): def test_pandas_parquet_native_file_roundtrip(tmpdir): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) - imos = pa.InMemoryOutputStream() + imos = pa.BufferOutputStream() pq.write_table(arrow_table, imos, version="2.0") buf = imos.get_result() reader = pa.BufferReader(buf) From 69a99cdddbb22cff317de3c34ad3f4c51eddddae Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 15 May 2017 16:40:13 -0400 Subject: [PATCH 2/3] Fix Cython compilation Change-Id: I29a3c344300ae785a6e1a6cf9c41df33328d338e --- python/pyarrow/includes/libarrow.pxd | 5 +++-- python/pyarrow/io.pxi | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b03dd59dffc..a7e2733ca81 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -507,8 +507,9 @@ cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: CBufferReader(const shared_ptr[CBuffer]& buffer) CBufferReader(const 
uint8_t* data, int64_t nbytes) - cdef cppclass BufferOutputStream(OutputStream): - BufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) + cdef cppclass CBufferOutputStream" arrow::io::BufferOutputStream"\ + (OutputStream): + CBufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7fd3b31339f..a153f222700 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -522,7 +522,7 @@ cdef class BufferOutputStream(NativeFile): def __cinit__(self, MemoryPool memory_pool=None): self.buffer = allocate_buffer(maybe_unbox_memory_pool(memory_pool)) - self.wr_file.reset(new BufferOutputStream( + self.wr_file.reset(new CBufferOutputStream( self.buffer)) self.is_readable = 0 self.is_writeable = 1 From 85b352c89d5c77d25a66e3bf6d9a6b8ee9b69ca1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 15 May 2017 18:17:04 -0400 Subject: [PATCH 3/3] Add more graceful deprecation warnings for renamed classes, test suite Change-Id: Ib19b789a6d7d34011d62caab719d89b7c5af7190 --- python/pyarrow/__init__.py | 31 ++++++++++- python/pyarrow/tests/test_deprecations.py | 64 +++++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 python/pyarrow/tests/test_deprecations.py diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 22f0c7c634c..632a443ed0d 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -113,4 +113,33 @@ def jemalloc_memory_pool(): # ---------------------------------------------------------------------- # 0.4.0 deprecations -InMemoryOutputStream = BufferOutputStream +import warnings + +def _deprecate_class(old_name, new_name, klass, next_version='0.5.0'): + msg = ('pyarrow.{0} has been renamed to ' + '{1}, will be removed in {2}' + .format(old_name, new_name, next_version)) + def deprecated_factory(*args, **kwargs): + warnings.warn(msg, 
FutureWarning) + return klass(*args, **kwargs) + return deprecated_factory + +FileReader = _deprecate_class('FileReader', + 'RecordBatchFileReader', + RecordBatchFileReader, '0.5.0') + +FileWriter = _deprecate_class('FileWriter', + 'RecordBatchFileWriter', + RecordBatchFileWriter, '0.5.0') + +StreamReader = _deprecate_class('StreamReader', + 'RecordBatchStreamReader', + RecordBatchStreamReader, '0.5.0') + +StreamWriter = _deprecate_class('StreamWriter', + 'RecordBatchStreamWriter', + RecordBatchStreamWriter, '0.5.0') + +InMemoryOutputStream = _deprecate_class('InMemoryOutputStream', + 'BufferOutputStream', + BufferOutputStream, '0.5.0') diff --git a/python/pyarrow/tests/test_deprecations.py b/python/pyarrow/tests/test_deprecations.py new file mode 100644 index 00000000000..62b96663833 --- /dev/null +++ b/python/pyarrow/tests/test_deprecations.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Check that various deprecation warnings are raised + +import pyarrow as pa +import pytest + + +def test_inmemory_output_stream(): + with pytest.warns(FutureWarning): + stream = pa.InMemoryOutputStream() + assert isinstance(stream, pa.BufferOutputStream) + + +def test_file_reader_writer(): + data = [ + pa.array([1, 2, 3, 4]), + pa.array(['foo', 'bar', 'baz', None]), + pa.array([True, None, False, True]) + ] + batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) + + sink = pa.BufferOutputStream() + + with pytest.warns(FutureWarning): + stream_writer = pa.StreamWriter(sink, batch.schema) + assert isinstance(stream_writer, pa.RecordBatchStreamWriter) + + sink2 = pa.BufferOutputStream() + with pytest.warns(FutureWarning): + file_writer = pa.FileWriter(sink2, batch.schema) + assert isinstance(file_writer, pa.RecordBatchFileWriter) + + file_writer.write_batch(batch) + stream_writer.write_batch(batch) + + file_writer.close() + stream_writer.close() + + buf = sink.get_result() + buf2 = sink2.get_result() + + with pytest.warns(FutureWarning): + stream_reader = pa.StreamReader(buf) + assert isinstance(stream_reader, pa.RecordBatchStreamReader) + + with pytest.warns(FutureWarning): + file_reader = pa.FileReader(buf2) + assert isinstance(file_reader, pa.RecordBatchFileReader)