From 1a4e03128b98661c9df0f4d0d4ab06a6e206a9ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 3 Sep 2019 12:38:54 +0200 Subject: [PATCH 01/36] local file system --- cpp/src/arrow/filesystem/api.h | 26 +++ cpp/src/arrow/filesystem/filesystem.h | 1 + python/CMakeLists.txt | 2 +- python/pyarrow/_fs.pyx | 295 ++++++++++++++++++++++++ python/pyarrow/fs.py | 20 ++ python/pyarrow/includes/libarrow_fs.pxd | 91 ++++++++ python/pyarrow/lib.pxd | 16 ++ python/pyarrow/tests/test_fs.py | 264 +++++++++++++++++++++ python/setup.py | 1 + 9 files changed, 715 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/filesystem/api.h create mode 100644 python/pyarrow/_fs.pyx create mode 100644 python/pyarrow/fs.py create mode 100644 python/pyarrow/includes/libarrow_fs.pxd create mode 100644 python/pyarrow/tests/test_fs.py diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h new file mode 100644 index 00000000000..fd8f566a78e --- /dev/null +++ b/cpp/src/arrow/filesystem/api.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_FILESYSTEM_API_H +#define ARROW_FILESYSTEM_API_H + +#include "arrow/filesystem/filesystem.h" // IWYU pragma: export +#include "arrow/filesystem/localfs.h" // IWYU pragma: export +#include "arrow/filesystem/mockfs.h" // IWYU pragma: export +#include "arrow/filesystem/s3fs.h" // IWYU pragma: export + +#endif // ARROW_FILESYSTEM_API_H diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index a00fa1269ed..fefe92f05a8 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -115,6 +115,7 @@ struct ARROW_EXPORT FileStats { }; /// \brief EXPERIMENTAL: file selector +// call it FileSelector? struct ARROW_EXPORT Selector { // The directory in which to select files. // If the path exists but doesn't point to a directory, this should be an error. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 87d26d319dd..1529d77585c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -379,7 +379,7 @@ if(UNIX) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) endif() -set(CYTHON_EXTENSIONS lib _csv _json) +set(CYTHON_EXTENSIONS lib _fs _csv _json) set(LINK_LIBS arrow_shared arrow_python_shared) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx new file mode 100644 index 00000000000..2f422c13d08 --- /dev/null +++ b/python/pyarrow/_fs.pyx @@ -0,0 +1,295 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pathlib import Path + +from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow.lib import _detect_compression +from pyarrow.lib cimport ( + check_status, + NativeFile, + BufferedOutputStream, + BufferedInputStream, + CompressedInputStream, + CompressedOutputStream +) + + +cdef inline c_string _to_string(p): + # supports types: byte, str, pathlib.Path + return tobytes(str(p)) + + +cpdef enum FileType: + NonExistent + Uknown + File + Directory + + +cdef class TimePoint: + pass + + +cdef class FileStats: + + cdef CFileStats stats + + def __init__(self): + raise TypeError('dont initialize me') + + @staticmethod + cdef FileStats wrap(CFileStats stats): + cdef FileStats self = FileStats.__new__(FileStats) + self.stats = stats + return self + + @property + def type(self): + cdef CFileType ctype = self.stats.type() + if ctype == CFileType_NonExistent: + return FileType.NonExistent + elif ctype == CFileType_Unknown: + return FileType.Uknown + elif ctype == CFileType_File: + return FileType.File + elif ctype == CFileType_Directory: + return FileType.Directory + else: + raise ValueError('Unhandled FileType {}'.format(type)) + + @property + def path(self): + return Path(frombytes(self.stats.path())) + + @property + def base_name(self): + return frombytes(self.stats.base_name()) + + @property + def size(self): + return self.stats.size() + + @property + def extension(self): + return frombytes(self.stats.extension()) + + # @property + # def mtime(self): + # return self.stats.mtime() + + +cdef class Selector: + + cdef CSelector selector + + def __init__(self, base_dir='', bint allow_non_existent=False, + bint recursive=False): + self.base_dir = base_dir + self.recursive = recursive + self.allow_non_existent = allow_non_existent + + @property + def base_dir(self): + return Path(self.selector.base_dir) + + @base_dir.setter + def base_dir(self, base_dir): + self.selector.base_dir = _to_string(base_dir) + + @property + def allow_non_existent(self): + return self.selector.allow_non_existent + + @allow_non_existent.setter + def allow_non_existent(self, bint allow_non_existent): + self.selector.allow_non_existent = allow_non_existent + + @property + def recursive(self): + return self.selector.recursive + + @recursive.setter + def recursive(self, bint recursive): + self.selector.recursive = recursive + + +cdef class FileSystem: + + cdef: + shared_ptr[CFileSystem] wrapped + CFileSystem* fs + + def __init__(self): + raise TypeError('dont initialize me') + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + self.wrapped = wrapped + self.fs = wrapped.get() + + def stat(self, paths_or_selector): + cdef: + vector[CFileStats] stats + vector[c_string] paths + CSelector selector + + if isinstance(paths_or_selector, Selector): + selector = (paths_or_selector).selector + check_status(self.fs.GetTargetStats(selector, &stats)) + elif isinstance(paths_or_selector, (list, tuple)): + paths = [_to_string(s) for s in paths_or_selector] + check_status(self.fs.GetTargetStats(paths, &stats)) + else: + raise TypeError('Must pass either paths or a Selector') + + return [FileStats.wrap(stat) for stat in stats] + + def mkdir(self, path, bint recursive=True): + check_status(self.fs.CreateDir(_to_string(path), recursive=recursive)) + + def rmdir(self, path): + check_status(self.fs.DeleteDir(_to_string(path))) + + def mv(self, src, dest): + check_status(self.fs.Move(_to_string(src), _to_string(dest))) + + def cp(self, src, dest): + check_status(self.fs.CopyFile(_to_string(src), _to_string(dest))) + + def rm(self, path): + check_status(self.fs.DeleteFile(_to_string(path))) + + def _wrap_input_stream(self, stream, path, compression, buffer_size): + if buffer_size is not None and buffer_size != 0: + stream = BufferedInputStream(stream, buffer_size) + if compression == 'detect': + compression = _detect_compression(path) + if compression is not None: + stream = CompressedInputStream(stream, compression) + return stream + + def _wrap_output_stream(self, stream, path, compression, buffer_size): + if buffer_size is not None and buffer_size != 0: + stream = BufferedOutputStream(stream, buffer_size) + if compression == 'detect': + compression = _detect_compression(path) + if compression is not None: + stream = CompressedOutputStream(stream, compression) + return stream + + def input_file(self, path): + cdef: + c_string pathstr = _to_string(path) + NativeFile stream = NativeFile() + shared_ptr[CRandomAccessFile] in_handle + + with nogil: + check_status(self.fs.OpenInputFile(pathstr, &in_handle)) + + stream.set_random_access_file(in_handle) + stream.is_readable = True + return stream + + def input_stream(self, path, compression='detect', buffer_size=None): + cdef: + c_string pathstr = _to_string(path) + NativeFile stream = NativeFile() + shared_ptr[CInputStream] in_handle + + with nogil: + check_status(self.fs.OpenInputStream(pathstr, &in_handle)) + + stream.set_input_stream(in_handle) + stream.is_readable = True + + return self._wrap_input_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + def output_stream(self, path, compression='detect', buffer_size=None): + cdef: + c_string pathstr = _to_string(path) + NativeFile stream = NativeFile() + shared_ptr[COutputStream] out_handle + + with nogil: + check_status(self.fs.OpenOutputStream(pathstr, &out_handle)) + + stream.set_output_stream(out_handle) + stream.is_writable = True + + return self._wrap_output_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + def append_stream(self, path, compression='detect', buffer_size=None): + cdef: + c_string pathstr = _to_string(path) + NativeFile stream = NativeFile() + shared_ptr[COutputStream] out_handle + + with nogil: + check_status(self.fs.OpenAppendStream(pathstr, &out_handle)) + + stream.set_output_stream(out_handle) + stream.is_writable = True + + return self._wrap_output_stream( + stream, path=path, compression=compression, buffer_size=buffer_size + ) + + # CStatus OpenInputFile(const c_string& path, + # shared_ptr[RandomAccessFile]* out) + + +cdef class LocalFileSystem(FileSystem): + + cdef: + CLocalFileSystem* localfs + + def __init__(self): + cdef shared_ptr[CLocalFileSystem] wrapped + wrapped = make_shared[CLocalFileSystem]() + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.localfs = wrapped.get() + + +cdef class SubTreeFileSystem(FileSystem): + + cdef: + CSubTreeFileSystem* subtreefs + + def __init__(self, base_path, FileSystem base_fs): + cdef: + c_string pathstr + shared_ptr[CSubTreeFileSystem] wrapped + + pathstr = tobytes(str(base_path)) + wrapped = make_shared[CSubTreeFileSystem](pathstr, base_fs.wrapped) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.subtreefs = wrapped.get() diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py new file mode 100644 index 00000000000..cd5263acbca --- /dev/null +++ b/python/pyarrow/fs.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from pyarrow._fs import * # noqa diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd new file mode 100644 index 00000000000..fceaa28bf2a --- /dev/null +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from libcpp.functional cimport function + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport ( + InputStream as CInputStream, + OutputStream as COutputStream, + RandomAccessFile as CRandomAccessFile +) + + +cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: + + enum CFileType "arrow::fs::FileType": + CFileType_NonExistent "arrow::fs::FileType::NonExistent" + CFileType_Unknown "arrow::fs::FileType::Unknown" + CFileType_File "arrow::fs::FileType::File" + CFileType_Directory "arrow::fs::FileType::Directory" + + cdef cppclass CTimePoint "arrow::fs:TimePoint": + pass + + cdef cppclass CFileStats "arrow::fs::FileStats": + CFileStats() + CFileStats(CFileStats&&) + CFileStats& operator=(CFileStats&&) + CFileStats(const CFileStats&) + CFileStats& operator=(const CFileStats&) + + CFileType type() + void set_type(CFileType type) + c_string path() + void set_path(const c_string& path) + c_string base_name() + int64_t size() + void set_size(int64_t size) + c_string extension() + CTimePoint mtime() + void set_mtime(CTimePoint mtime) + + cdef cppclass CSelector "arrow::fs::Selector": + CSelector() + c_string base_dir + c_bool allow_non_existent + c_bool recursive + + cdef cppclass CFileSystem "arrow::fs::FileSystem": + CStatus GetTargetStats(const c_string& path, CFileStats* out) + CStatus GetTargetStats(const vector[c_string]& paths, + vector[CFileStats]* out) + CStatus GetTargetStats(const CSelector& select, + vector[CFileStats]* out) + CStatus CreateDir(const c_string& path, c_bool recursive) + CStatus DeleteDir(const c_string& path) + CStatus DeleteFile(const c_string& path) + CStatus DeleteFiles(const vector[c_string]& paths) + CStatus Move(const c_string& src, const c_string& dest) + CStatus CopyFile(const c_string& src, const c_string& dest) + CStatus OpenInputStream(const c_string& path, + shared_ptr[CInputStream]* out) + CStatus OpenInputFile(const c_string& path, + shared_ptr[CRandomAccessFile]* out) + CStatus OpenOutputStream(const c_string& path, + shared_ptr[COutputStream]* out) + CStatus OpenAppendStream(const c_string& path, + shared_ptr[COutputStream]* out) + + cdef cppclass CLocalFileSystem "arrow::fs::LocalFileSystem"(CFileSystem): + LocalFileSystem() + + cdef cppclass CSubTreeFileSystem "arrow::fs::SubTreeFileSystem"(CFileSystem): + CSubTreeFileSystem(const c_string& base_path, + shared_ptr[CFileSystem] base_fs) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 04a6157126c..2f4aeceaf57 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -452,6 +452,22 @@ cdef class NativeFile: cdef shared_ptr[OutputStream] get_output_stream(self) except * +cdef class BufferedInputStream(NativeFile): + pass + + +cdef class BufferedOutputStream(NativeFile): + pass + + +cdef class CompressedInputStream(NativeFile): + pass + + +cdef class CompressedOutputStream(NativeFile): + pass + + cdef class _CRecordBatchWriter: cdef: shared_ptr[CRecordBatchWriter] writer diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py new file mode 100644 index 00000000000..495ede8c941 --- /dev/null +++ b/python/pyarrow/tests/test_fs.py @@ -0,0 +1,264 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pathlib import Path + +import pytest + +from pyarrow import ArrowIOError +from pyarrow.fs import ( + FileType, + FileStats, + LocalFileSystem, + SubTreeFileSystem, + Selector +) +from pyarrow.tests.test_io import gzip_compress, gzip_decompress + + +@pytest.fixture(params=[ + pytest.param( + lambda tmp: LocalFileSystem(), + id='LocalFileSystem' + ), + pytest.param( + lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()), + id='SubTreeFileSystem(LocalFileSystem)' + ), + # s3 follows + # pytest.param( + # lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()), + # name='SubTreeFileSystem[LocalFileSystem]' + # ), +]) +def fs(request, tempdir): + return request.param(tempdir) + + +@pytest.fixture(params=[ + pytest.param(Path, id='Path'), + pytest.param(str, id='str') +]) +def testpath(request, fs, tempdir): + # we always use the tempdir for rewding and writing test artifacts, but + # if the filesystem is wrapped in a SubTreeFileSystem then we don't need + # to prepend the path with the tempdir, we also test the API with both + # pathlib.Path objects and plain python strings + if isinstance(fs, SubTreeFileSystem): + return lambda path: request.param(path) + else: + return lambda path: request.param(tempdir / path) + + +def test_stat_with_paths(fs, tempdir, testpath): + (tempdir / 'a' / 'aa' / 'aaa').mkdir(parents=True) + (tempdir / 'a' / 'bb').touch() + (tempdir / 'c.txt').touch() + + aaa_path = testpath('a/aa/aaa') + bb_path = testpath('a/bb') + c_path = testpath('c.txt') + + aaa, bb, c = fs.stat([aaa_path, bb_path, c_path]) + + assert aaa.path == Path(aaa_path) + assert aaa.base_name == 'aaa' + assert aaa.extension == '' + assert aaa.type == FileType.Directory + + assert bb.path == Path(bb_path) + assert bb.base_name == 'bb' + assert bb.extension == '' + assert bb.type == FileType.File + + assert c.path == Path(c_path) + assert c.base_name == 'c.txt' + assert c.extension == 'txt' + assert c.type == FileType.File + + selector = Selector(testpath(''), allow_non_existent=False, recursive=True) + nodes = fs.stat(selector) + assert len(nodes) == 5 + assert len(list(n for n in nodes if n.type == FileType.File)) == 2 + assert len(list(n for n in nodes if n.type == FileType.Directory)) == 3 + + +def test_mkdir(fs, tempdir, testpath): + directory = testpath('directory') + directory_ = tempdir / 'directory' + assert not directory_.exists() + fs.mkdir(directory) + assert directory_.exists() + + # recursive + directory = testpath('deeply/nested/directory') + directory_ = tempdir / 'deeply' / 'nested' / 'directory' + assert not directory_.exists() + with pytest.raises(ArrowIOError): + fs.mkdir(directory, recursive=False) + fs.mkdir(directory) + assert directory_.exists() + + +def test_rmdir(fs, tempdir, testpath): + folder = testpath('directory') + nested = testpath('nested/directory') + folder_ = tempdir / 'directory' + nested_ = tempdir / 'nested' / 'directory' + + folder_.mkdir() + nested_.mkdir(parents=True) + + assert folder_.exists() + fs.rmdir(folder) + assert not folder_.exists() + + assert nested_.exists() + fs.rmdir(nested) + assert not nested_.exists() + + +def test_cp(fs, tempdir, testpath): + # copy file + source = testpath('source-file') + source_ = tempdir / 'source-file' + source_.touch() + target = testpath('target-file') + target_ = tempdir / 'target-file' + assert not target_.exists() + fs.cp(source, target) + assert source_.exists() + assert target_.exists() + + +def test_mv(fs, tempdir, testpath): + # move directory + source = testpath('source-dir') + source_ = tempdir / 'source-dir' + source_.mkdir() + target = testpath('target-dir') + target_ = tempdir / 'target-dir' + assert not target_.exists() + fs.mv(source, target) + assert not source_.exists() + assert target_.exists() + + # move file + source = testpath('source-file') + source_ = tempdir / 'source-file' + source_.touch() + target = testpath('target-file') + target_ = tempdir / 'target-file' + assert not target_.exists() + fs.mv(source, target) + assert not source_.exists() + assert target_.exists() + + +def test_rm(fs, tempdir, testpath): + target = testpath('target-file') + target_ = tempdir / 'target-file' + target_.touch() + assert target_.exists() + fs.rm(target) + assert not target_.exists() + + nested = testpath('nested/target-file') + nested_ = tempdir / 'nested/target-file' + nested_.parent.mkdir() + nested_.touch() + assert nested_.exists() + fs.rm(nested) + assert not nested_.exists() + + +@pytest.mark.parametrize( + ('compression', 'buffer_size', 'compressor'), + [ + (None, None, lambda s: s), + (None, 64, lambda s: s), + ('gzip', None, gzip_compress), + ('gzip', 256, gzip_compress), + ] +) +def test_input_stream(fs, tempdir, testpath, compression, buffer_size, + compressor): + file = testpath('abc') + file_ = tempdir / 'abc' + data = b'some data' * 1024 + file_.write_bytes(compressor(data)) + + with fs.input_stream(file, compression, buffer_size) as f: + result = f.read() + + assert result == data + + +def test_input_file(fs, tempdir, testpath): + file = testpath('abc') + file_ = tempdir / 'abc' + data = b'some data' * 1024 + file_.write_bytes(data) + + read_from = len(b'some data') * 512 + with fs.input_file(file) as f: + f.seek(read_from) + result = f.read() + + assert result == data[read_from:] + + +@pytest.mark.parametrize( + ('compression', 'buffer_size', 'decompressor'), + [ + (None, None, lambda s: s), + (None, 64, lambda s: s), + ('gzip', None, gzip_decompress), + ('gzip', 256, gzip_decompress), + ] +) +def test_output_stream(fs, tempdir, testpath, compression, buffer_size, + decompressor): + file = testpath('abc') + file_ = tempdir / 'abc' + + data = b'some data' * 1024 + with fs.output_stream(file, compression, buffer_size) as f: + f.write(data) + + assert decompressor(file_.read_bytes()) == data + + +@pytest.mark.parametrize( + ('compression', 'buffer_size', 'compressor', 'decompressor'), + [ + (None, None, lambda s: s, lambda s: s), + (None, 64, lambda s: s, lambda s: s), + ('gzip', None, gzip_compress, gzip_decompress), + ('gzip', 256, gzip_compress, gzip_decompress), + ] +) +def test_append_stream(fs, tempdir, testpath, compression, buffer_size, + compressor, decompressor): + file = testpath('abc') + file_ = tempdir / 'abc' + file_.write_bytes(compressor(b'already existing')) + + with fs.append_stream(file, compression, buffer_size) as f: + f.write(b'\nnewly added') + + assert decompressor(file_.read_bytes()) == b'already existing\nnewly added' diff --git a/python/setup.py b/python/setup.py index 0187403a839..5db1b7d1fa2 100755 --- a/python/setup.py +++ b/python/setup.py @@ -168,6 +168,7 @@ def initialize_options(self): CYTHON_MODULE_NAMES = [ 'lib', + '_fs', '_csv', '_json', '_cuda', From a6a62434ac7012dcd4cd9d0f4741140b5607d52e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 3 Sep 2019 15:21:55 +0200 Subject: [PATCH 02/36] pathlib --- python/pyarrow/tests/test_fs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 495ede8c941..803bf009e3e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -15,7 +15,12 @@ # specific language governing permissions and limitations # under the License. -from pathlib import Path +import os +import subprocess +try: + import pathlib +except ImportError: + import pathlib2 as pathlib # py2 compat import pytest @@ -50,7 +55,7 @@ def fs(request, tempdir): @pytest.fixture(params=[ - pytest.param(Path, id='Path'), + pytest.param(pathlib.Path, id='Path'), pytest.param(str, id='str') ]) def testpath(request, fs, tempdir): From cd217416d87bcb88ca4ca6e7625493d30d5b1874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 3 Sep 2019 15:24:28 +0200 Subject: [PATCH 03/36] remove comment --- python/pyarrow/_fs.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 2f422c13d08..52519c068fe 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -256,9 +256,6 @@ cdef class FileSystem: stream, path=path, compression=compression, buffer_size=buffer_size ) - # CStatus OpenInputFile(const c_string& path, - # shared_ptr[RandomAccessFile]* out) - cdef class LocalFileSystem(FileSystem): From fd0b4ca66f6cd5ee764e702bb3b528f510cf8fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 3 Sep 2019 15:32:53 +0200 Subject: [PATCH 04/36] flake8 --- python/pyarrow/tests/test_fs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 803bf009e3e..c773bde7101 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -import os -import subprocess try: import pathlib except ImportError: @@ -27,7 +25,6 @@ from pyarrow import ArrowIOError from pyarrow.fs import ( FileType, - FileStats, LocalFileSystem, SubTreeFileSystem, Selector @@ -80,17 +77,17 @@ def test_stat_with_paths(fs, tempdir, testpath): aaa, bb, c = fs.stat([aaa_path, bb_path, c_path]) - assert aaa.path == Path(aaa_path) + assert aaa.path == pathlib.Path(aaa_path) assert aaa.base_name == 'aaa' assert aaa.extension == '' assert aaa.type == FileType.Directory - assert bb.path == Path(bb_path) + assert bb.path == pathlib.Path(bb_path) assert bb.base_name == 'bb' assert bb.extension == '' assert bb.type == FileType.File - assert c.path == Path(c_path) + assert c.path == pathlib.Path(c_path) assert c.base_name == 'c.txt' assert c.extension == 'txt' assert c.type == FileType.File From 5de88810ec3406add42daf658f9bdbcea5ff1554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Sep 2019 15:18:38 +0200 Subject: [PATCH 05/36] mtime --- cpp/cmake_modules/FindClangTools.cmake | 4 +- cpp/src/arrow/python/CMakeLists.txt | 1 + cpp/src/arrow/python/api.h | 1 + cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- .../python/{util/datetime.h => datetime.cc} | 82 ++++------- cpp/src/arrow/python/datetime.h | 76 ++++++++++ cpp/src/arrow/python/deserialize.cc | 2 +- cpp/src/arrow/python/inference.cc | 2 +- cpp/src/arrow/python/numpy_to_arrow.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 2 +- cpp/src/arrow/python/serialize.cc | 2 +- python/pyarrow/_fs.pyx | 66 ++++----- python/pyarrow/includes/common.pxd | 1 + python/pyarrow/includes/libarrow.pxd | 6 + python/pyarrow/includes/libarrow_fs.pxd | 2 +- python/pyarrow/tests/test_fs.py | 135 +++++++++--------- 16 files changed, 226 insertions(+), 160 deletions(-) rename cpp/src/arrow/python/{util/datetime.h => datetime.cc} (79%) create mode 100644 cpp/src/arrow/python/datetime.h diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index a6ecbf78437..be251e7b6c4 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -75,7 +75,7 @@ function(FIND_CLANG_TOOL NAME OUTPUT VERSION_CHECK_PATTERN) endfunction() find_clang_tool(clang-tidy CLANG_TIDY_BIN - "LLVM version ${ARROW_LLVM_MAJOR_VERSION}\\.${ARROW_LLVM_MINOR_VERSION}") + "LLVM version ${ARROW_LLVM_MAJOR_VERSION}") if(CLANG_TIDY_BIN) set(CLANG_TIDY_FOUND 1) message(STATUS "clang-tidy found at ${CLANG_TIDY_BIN}") @@ -86,7 +86,7 @@ endif() find_clang_tool( clang-format CLANG_FORMAT_BIN - "^clang-format version ${ARROW_LLVM_MAJOR_VERSION}\\.${ARROW_LLVM_MINOR_VERSION}") + "^clang-format version ${ARROW_LLVM_MAJOR_VERSION}") if(CLANG_FORMAT_BIN) set(CLANG_FORMAT_FOUND 1) message(STATUS "clang-format found at ${CLANG_FORMAT_BIN}") diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index a89eba91c06..85a4aadf3f4 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -32,6 +32,7 @@ set(ARROW_PYTHON_SRCS benchmark.cc common.cc config.cc + datetime.cc decimal.cc deserialize.cc extension_type.cc diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index 6bbfcbfa34b..c9eb6e87fb5 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -20,6 +20,7 @@ #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/common.h" +#include "arrow/python/datetime.h" #include "arrow/python/deserialize.h" #include "arrow/python/helpers.h" #include "arrow/python/inference.h" diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index beabcebf9c3..ceddfd5d329 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -46,13 +46,13 @@ #include "arrow/python/common.h" #include "arrow/python/config.h" +#include "arrow/python/datetime.h" #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/numpy_internal.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/type_traits.h" -#include "arrow/python/util/datetime.h" namespace arrow { diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/datetime.cc similarity index 79% rename from cpp/src/arrow/python/util/datetime.h rename to cpp/src/arrow/python/datetime.cc index 6194b7f7115..6fc798c0ec3 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/datetime.cc @@ -15,16 +15,17 @@ // specific language governing permissions and limitations // under the License. -#ifndef PYARROW_UTIL_DATETIME_H -#define PYARROW_UTIL_DATETIME_H - +#include #include +#include -#include #include "arrow/python/platform.h" +#include "arrow/python/datetime.h" #include "arrow/status.h" +#include "arrow/type.h" #include "arrow/util/logging.h" + namespace arrow { namespace py { @@ -155,25 +156,13 @@ static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_m return; } -static inline int64_t PyTime_to_us(PyObject* pytime) { +int64_t PyTime_to_us(PyObject* pytime) { return (static_cast(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL + static_cast(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL + static_cast(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL + PyDateTime_TIME_GET_MICROSECOND(pytime)); } -static inline int64_t PyTime_to_s(PyObject* pytime) { - return PyTime_to_us(pytime) / 1000000; -} - -static inline int64_t PyTime_to_ms(PyObject* pytime) { - return PyTime_to_us(pytime) / 1000; -} - -static inline int64_t PyTime_to_ns(PyObject* pytime) { - return PyTime_to_us(pytime) * 1000; -} - // Splitting time quantities, for example splitting total seconds into // minutes and remaining seconds. After we run // int64_t remaining = split_time(total, quotient, &next) @@ -181,7 +170,7 @@ static inline int64_t PyTime_to_ns(PyObject* pytime) { // total = next * quotient + remaining. Handles negative values by propagating // them: If total is negative, next will be negative and remaining will // always be non-negative. -static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { +static int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { int64_t r = total % quotient; if (r < 0) { *next = total / quotient - 1; @@ -192,7 +181,7 @@ static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) } } -static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, +static Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, int64_t* hour, int64_t* minute, int64_t* second, int64_t* microsecond) { switch (unit) { @@ -220,7 +209,7 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } -static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, +static Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, int64_t* month, int64_t* day) { switch (unit) { case DateUnit::MILLI: @@ -233,8 +222,7 @@ static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_ return Status::OK(); } -static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, - PyObject** out) { +Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { int64_t hour = 0, minute = 0, second = 0, microsecond = 0; RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); *out = PyTime_FromTime(static_cast(hour), static_cast(minute), @@ -242,7 +230,7 @@ static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } -static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { +Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { int64_t year = 0, month = 0, day = 0; RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); *out = PyDate_FromDate(static_cast(year), static_cast(month), @@ -250,8 +238,8 @@ static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject* return Status::OK(); } -static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, - PyObject** out) { +Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { + PyDateTime_IMPORT; int64_t hour = 0, minute = 0, second = 0, microsecond = 0; RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); int64_t total_days = 0; @@ -265,21 +253,25 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } -static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) { - return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), - PyDateTime_GET_DAY(pydate)); +using TimePoint = + std::chrono::time_point; + +Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out) { + auto nanos = val.time_since_epoch(); + auto micros = std::chrono::duration_cast(nanos); + if (micros < nanos) { + micros += std::chrono::microseconds(1); // ceil + } + RETURN_NOT_OK(PyDateTime_from_int(micros.count(), TimeUnit::MICRO, out)); + return Status::OK(); } -static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { - int64_t total_seconds = 0; - int64_t days = - get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), - PyDateTime_GET_DAY(pydate)); - total_seconds += days * 24 * 3600; - return total_seconds * 1000; +int64_t PyDate_to_days(PyDateTime_Date* pydate) { + return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), + PyDateTime_GET_DAY(pydate)); } -static inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { +int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { int64_t total_seconds = 0; total_seconds += PyDateTime_DATE_GET_SECOND(pydatetime); total_seconds += PyDateTime_DATE_GET_MINUTE(pydatetime) * 60; @@ -289,23 +281,5 @@ static inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { (PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL); } -static inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { - int64_t date_ms = PyDateTime_to_s(pydatetime) * 1000; - int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000; - return date_ms + ms; -} - -static inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { - int64_t ms = PyDateTime_to_s(pydatetime) * 1000; - int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); - return ms * 1000 + us; -} - -static inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { - return PyDateTime_to_us(pydatetime) * 1000; -} - } // namespace py } // namespace arrow - -#endif // PYARROW_UTIL_DATETIME_H diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h new file mode 100644 index 00000000000..e00c44642d2 --- /dev/null +++ b/cpp/src/arrow/python/datetime.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_UTIL_DATETIME_H +#define PYARROW_UTIL_DATETIME_H + +#include +#include + +#include "arrow/python/platform.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace py { + +int64_t PyTime_to_us(PyObject* pytime); + +inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; } + +inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; } + +inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; } + +Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); +Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out); +Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); + +using TimePoint = + std::chrono::time_point; + +Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out); + +int64_t PyDate_to_days(PyDateTime_Date* pydate); + +inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { + return PyDate_to_days(pydate) * 24 * 3600 * 1000; +} + +int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime); + +inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { + int64_t date_ms = PyDateTime_to_s(pydatetime) * 1000; + int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000; + return date_ms + ms; +} + +inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { + int64_t ms = PyDateTime_to_s(pydatetime) * 1000; + int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); + return ms * 1000 + us; +} + +inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { + return PyDateTime_to_us(pydatetime) * 1000; +} + +} // namespace py +} // namespace arrow + +#endif // PYARROW_UTIL_DATETIME_H diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index 45f7d61890e..7cad8c9682e 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -39,11 +39,11 @@ #include "arrow/util/parsing.h" #include "arrow/python/common.h" +#include "arrow/python/datetime.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/pyarrow.h" #include "arrow/python/serialize.h" -#include "arrow/python/util/datetime.h" namespace arrow { diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index eddba0ec4e0..e8af80ee28d 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -31,11 +31,11 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/python/datetime.h" #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" -#include "arrow/python/util/datetime.h" namespace arrow { namespace py { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index caee934c913..1cb68f4732d 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -48,13 +48,13 @@ #include "arrow/python/common.h" #include "arrow/python/config.h" +#include "arrow/python/datetime.h" #include "arrow/python/helpers.h" #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/numpy_internal.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/type_traits.h" -#include "arrow/python/util/datetime.h" namespace arrow { diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 4b4e759e523..f7b7b169c7f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -38,13 +38,13 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/python/datetime.h" #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" #include "arrow/python/inference.h" #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" -#include "arrow/python/util/datetime.h" namespace arrow { diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index aaed0736f63..a5a7dada584 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -41,12 +41,12 @@ #include "arrow/util/logging.h" #include "arrow/python/common.h" +#include "arrow/python/datetime.h" #include "arrow/python/helpers.h" #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/platform.h" #include "arrow/python/pyarrow.h" -#include "arrow/python/util/datetime.h" constexpr int32_t kMaxRecursionDepth = 100; diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 52519c068fe..b6cb4bcaf49 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -17,10 +17,14 @@ # cython: language_level = 3 -from pathlib import Path +try: + import pathlib +except ImportError: + import pathlib2 as pathlib # py2 compat from pyarrow.compat import frombytes, tobytes from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport PyDateTime_from_TimePoint from pyarrow.includes.libarrow_fs cimport * from pyarrow.lib import _detect_compression from pyarrow.lib cimport ( @@ -33,7 +37,7 @@ from pyarrow.lib cimport ( ) -cdef inline c_string _to_string(p): +cdef inline c_string _path_to_bytes(p): # supports types: byte, str, pathlib.Path return tobytes(str(p)) @@ -45,10 +49,6 @@ cpdef enum FileType: Directory -cdef class TimePoint: - pass - - cdef class FileStats: cdef CFileStats stats @@ -78,7 +78,7 @@ cdef class FileStats: @property def path(self): - return Path(frombytes(self.stats.path())) + return pathlib.Path(frombytes(self.stats.path())) @property def base_name(self): @@ -92,9 +92,11 @@ cdef class FileStats: def extension(self): return frombytes(self.stats.extension()) - # @property - # def mtime(self): - # return self.stats.mtime() + @property + def mtime(self): + cdef PyObject *out + check_status(PyDateTime_from_TimePoint(self.stats.mtime(), &out)) + return PyObject_to_object(out) cdef class Selector: @@ -109,11 +111,11 @@ cdef class Selector: @property def base_dir(self): - return Path(self.selector.base_dir) + return pathlib.Path(self.selector.base_dir) @base_dir.setter def base_dir(self, base_dir): - self.selector.base_dir = _to_string(base_dir) + self.selector.base_dir = _path_to_bytes(base_dir) @property def allow_non_existent(self): @@ -145,7 +147,7 @@ cdef class FileSystem: self.wrapped = wrapped self.fs = wrapped.get() - def stat(self, paths_or_selector): + def get_target_stats(self, paths_or_selector): cdef: vector[CFileStats] stats vector[c_string] paths @@ -155,27 +157,27 @@ cdef class FileSystem: selector = (paths_or_selector).selector check_status(self.fs.GetTargetStats(selector, &stats)) elif isinstance(paths_or_selector, (list, tuple)): - paths = [_to_string(s) for s in paths_or_selector] + paths = [_path_to_bytes(s) for s in paths_or_selector] check_status(self.fs.GetTargetStats(paths, &stats)) else: raise TypeError('Must pass either paths or a Selector') return [FileStats.wrap(stat) for stat in stats] - def mkdir(self, path, bint recursive=True): - check_status(self.fs.CreateDir(_to_string(path), recursive=recursive)) + def create_dir(self, path, bint recursive=True): + check_status(self.fs.CreateDir(_path_to_bytes(path), recursive=recursive)) - def rmdir(self, path): - check_status(self.fs.DeleteDir(_to_string(path))) + def delete_dir(self, path): + check_status(self.fs.DeleteDir(_path_to_bytes(path))) - def mv(self, src, dest): - check_status(self.fs.Move(_to_string(src), _to_string(dest))) + def move(self, src, dest): + check_status(self.fs.Move(_path_to_bytes(src), _path_to_bytes(dest))) - def cp(self, src, dest): - check_status(self.fs.CopyFile(_to_string(src), _to_string(dest))) + def copy_file(self, src, dest): + check_status(self.fs.CopyFile(_path_to_bytes(src), _path_to_bytes(dest))) - def rm(self, path): - check_status(self.fs.DeleteFile(_to_string(path))) + def delete_file(self, path): + check_status(self.fs.DeleteFile(_path_to_bytes(path))) def _wrap_input_stream(self, stream, path, compression, buffer_size): if buffer_size is not None and buffer_size != 0: @@ -195,9 +197,9 @@ cdef class FileSystem: stream = CompressedOutputStream(stream, compression) return stream - def input_file(self, path): + def open_input_file(self, path): cdef: - c_string pathstr = _to_string(path) + c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() shared_ptr[CRandomAccessFile] in_handle @@ -208,9 +210,9 @@ cdef class FileSystem: stream.is_readable = True return stream - def input_stream(self, path, compression='detect', buffer_size=None): + def open_input_stream(self, path, compression='detect', buffer_size=None): cdef: - c_string pathstr = _to_string(path) + c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() shared_ptr[CInputStream] in_handle @@ -224,9 +226,9 @@ cdef class FileSystem: stream, path=path, compression=compression, buffer_size=buffer_size ) - def output_stream(self, path, compression='detect', buffer_size=None): + def open_output_stream(self, path, compression='detect', buffer_size=None): cdef: - c_string pathstr = _to_string(path) + c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle @@ -240,9 +242,9 @@ cdef class FileSystem: stream, path=path, compression=compression, buffer_size=buffer_size ) - def append_stream(self, path, compression='detect', buffer_size=None): + def open_append_stream(self, path, compression='detect', buffer_size=None): cdef: - c_string pathstr = _to_string(path) + c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 721facdfbb2..72247b776e7 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -29,6 +29,7 @@ from libcpp.unordered_set cimport unordered_set from cpython cimport PyObject cimport cpython + cdef extern from * namespace "std" nogil: cdef shared_ptr[T] static_pointer_cast[T, U](shared_ptr[U]) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 83df0e9533c..5d632217de8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1281,6 +1281,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: object PyHalf_FromHalf(npy_half value) + cdef cppclass CTimePoint "arrow::py::TimePoint": + pass + + cdef CStatus PyDateTime_from_int(int64_t val, const TimeUnit unit, PyObject** out) + cdef CStatus PyDateTime_from_TimePoint(CTimePoint val, PyObject** out) + cdef cppclass PyConversionOptions: PyConversionOptions() diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index fceaa28bf2a..c29db700116 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -35,7 +35,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CFileType_File "arrow::fs::FileType::File" CFileType_Directory "arrow::fs::FileType::Directory" - cdef cppclass CTimePoint "arrow::fs:TimePoint": + cdef cppclass CTimePoint "arrow::fs::TimePoint": pass cdef cppclass CFileStats "arrow::fs::FileStats": diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index c773bde7101..c79fc02a73d 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +from datetime import datetime try: import pathlib except ImportError: @@ -23,12 +24,7 @@ import pytest from pyarrow import ArrowIOError -from pyarrow.fs import ( - FileType, - LocalFileSystem, - SubTreeFileSystem, - Selector -) +from pyarrow.fs import FileType, LocalFileSystem, SubTreeFileSystem, Selector from pyarrow.tests.test_io import gzip_compress, gzip_decompress @@ -40,12 +36,7 @@ pytest.param( lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()), id='SubTreeFileSystem(LocalFileSystem)' - ), - # s3 follows - # pytest.param( - # lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()), - # name='SubTreeFileSystem[LocalFileSystem]' - # ), + ) ]) def fs(request, tempdir): return request.param(tempdir) @@ -66,44 +57,54 @@ def testpath(request, fs, tempdir): return lambda path: request.param(tempdir / path) -def test_stat_with_paths(fs, tempdir, testpath): - (tempdir / 'a' / 'aa' / 'aaa').mkdir(parents=True) - (tempdir / 'a' / 'bb').touch() - (tempdir / 'c.txt').touch() +def test_get_target_stats(fs, tempdir, testpath): + aaa, aaa_ = testpath('a/aa/aaa'), tempdir / 'a' / 'aa' / 'aaa' + bb, bb_ = testpath('a/bb'), tempdir / 'a' / 'bb' + c, c_ = testpath('c.txt'), tempdir / 'c.txt' + + aaa_.mkdir(parents=True) + bb_.touch() + c_.touch() - aaa_path = testpath('a/aa/aaa') - bb_path = testpath('a/bb') - c_path = testpath('c.txt') + def ceiled_mtime_range(path): + # arrow's filesystem implementation ceils mtime whereas pathlib rounds + return { + datetime.utcfromtimestamp(path.stat().st_mtime), + datetime.utcfromtimestamp(path.stat().st_mtime + 10**-6) + } - aaa, bb, c = fs.stat([aaa_path, bb_path, c_path]) + aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) - assert aaa.path == pathlib.Path(aaa_path) - assert aaa.base_name == 'aaa' - assert aaa.extension == '' - assert aaa.type == FileType.Directory + assert aaa_stat.path == pathlib.Path(aaa) + assert aaa_stat.base_name == 'aaa' + assert aaa_stat.extension == '' + assert aaa_stat.type == FileType.Directory + assert aaa_stat.mtime in ceiled_mtime_range(aaa_) - assert bb.path == pathlib.Path(bb_path) - assert bb.base_name == 'bb' - assert bb.extension == '' - assert bb.type == FileType.File + assert bb_stat.path == pathlib.Path(bb) + assert bb_stat.base_name == 'bb' + assert bb_stat.extension == '' + assert bb_stat.type == FileType.File + assert bb_stat.mtime in ceiled_mtime_range(bb_) - assert c.path == pathlib.Path(c_path) - assert c.base_name == 'c.txt' - assert c.extension == 'txt' - assert c.type == FileType.File + assert c_stat.path == pathlib.Path(c) + assert c_stat.base_name == 'c.txt' + assert c_stat.extension == 'txt' + assert c_stat.type == FileType.File + assert c_stat.mtime in ceiled_mtime_range(c_) selector = Selector(testpath(''), allow_non_existent=False, recursive=True) - nodes = fs.stat(selector) + nodes = fs.get_target_stats(selector) assert len(nodes) == 5 assert len(list(n for n in nodes if n.type == FileType.File)) == 2 assert len(list(n for n in nodes if n.type == FileType.Directory)) == 3 -def test_mkdir(fs, tempdir, testpath): +def test_create_dir(fs, tempdir, testpath): directory = testpath('directory') directory_ = tempdir / 'directory' assert not directory_.exists() - fs.mkdir(directory) + fs.create_dir(directory) assert directory_.exists() # recursive @@ -111,12 +112,12 @@ def test_mkdir(fs, tempdir, testpath): directory_ = tempdir / 'deeply' / 'nested' / 'directory' assert not directory_.exists() with pytest.raises(ArrowIOError): - fs.mkdir(directory, recursive=False) - fs.mkdir(directory) + fs.create_dir(directory, recursive=False) + fs.create_dir(directory) assert directory_.exists() -def test_rmdir(fs, tempdir, testpath): +def test_delete_dir(fs, tempdir, testpath): folder = testpath('directory') nested = testpath('nested/directory') folder_ = tempdir / 'directory' @@ -126,15 +127,15 @@ def test_rmdir(fs, tempdir, testpath): nested_.mkdir(parents=True) assert folder_.exists() - fs.rmdir(folder) + fs.delete_dir(folder) assert not folder_.exists() assert nested_.exists() - fs.rmdir(nested) + fs.delete_dir(nested) assert not nested_.exists() -def test_cp(fs, tempdir, testpath): +def test_copy_file(fs, tempdir, testpath): # copy file source = testpath('source-file') source_ = tempdir / 'source-file' @@ -142,12 +143,12 @@ def test_cp(fs, tempdir, testpath): target = testpath('target-file') target_ = tempdir / 'target-file' assert not target_.exists() - fs.cp(source, target) + fs.copy_file(source, target) assert source_.exists() assert target_.exists() -def test_mv(fs, tempdir, testpath): +def test_move(fs, tempdir, testpath): # move directory source = testpath('source-dir') source_ = tempdir / 'source-dir' @@ -155,7 +156,7 @@ def test_mv(fs, tempdir, testpath): target = testpath('target-dir') target_ = tempdir / 'target-dir' assert not target_.exists() - fs.mv(source, target) + fs.move(source, target) assert not source_.exists() assert target_.exists() @@ -166,17 +167,17 @@ def test_mv(fs, tempdir, testpath): target = testpath('target-file') target_ = tempdir / 'target-file' assert not target_.exists() - fs.mv(source, target) + fs.move(source, target) assert not source_.exists() assert target_.exists() -def test_rm(fs, tempdir, testpath): +def test_delete_file(fs, tempdir, testpath): target = testpath('target-file') target_ = tempdir / 'target-file' target_.touch() assert target_.exists() - fs.rm(target) + fs.delete_file(target) assert not target_.exists() nested = testpath('nested/target-file') @@ -184,40 +185,44 @@ def test_rm(fs, tempdir, testpath): nested_.parent.mkdir() nested_.touch() assert nested_.exists() - fs.rm(nested) + fs.delete_file(nested) assert not nested_.exists() +def identity(v): + return v + + @pytest.mark.parametrize( ('compression', 'buffer_size', 'compressor'), [ - (None, None, lambda s: s), - (None, 64, lambda s: s), + (None, None, identity), + (None, 64, identity), ('gzip', None, gzip_compress), ('gzip', 256, gzip_compress), ] ) -def test_input_stream(fs, tempdir, testpath, compression, buffer_size, - compressor): +def test_open_input_stream(fs, tempdir, testpath, compression, buffer_size, + compressor): file = testpath('abc') file_ = tempdir / 'abc' data = b'some data' * 1024 file_.write_bytes(compressor(data)) - with fs.input_stream(file, compression, buffer_size) as f: + with fs.open_input_stream(file, compression, buffer_size) as f: result = f.read() assert result == data -def test_input_file(fs, tempdir, testpath): +def test_open_input_file(fs, tempdir, testpath): file = testpath('abc') file_ = tempdir / 'abc' data = b'some data' * 1024 file_.write_bytes(data) read_from = len(b'some data') * 512 - with fs.input_file(file) as f: + with fs.open_input_file(file) as f: f.seek(read_from) result = f.read() @@ -227,19 +232,19 @@ def test_input_file(fs, tempdir, testpath): @pytest.mark.parametrize( ('compression', 'buffer_size', 'decompressor'), [ - (None, None, lambda s: s), - (None, 64, lambda s: s), + (None, None, identity), + (None, 64, identity), ('gzip', None, gzip_decompress), ('gzip', 256, gzip_decompress), ] ) -def test_output_stream(fs, tempdir, testpath, compression, buffer_size, - decompressor): +def test_open_output_stream(fs, tempdir, testpath, compression, buffer_size, + decompressor): file = testpath('abc') file_ = tempdir / 'abc' data = b'some data' * 1024 - with fs.output_stream(file, compression, buffer_size) as f: + with fs.open_output_stream(file, compression, buffer_size) as f: f.write(data) assert decompressor(file_.read_bytes()) == data @@ -248,19 +253,19 @@ def test_output_stream(fs, tempdir, testpath, compression, buffer_size, @pytest.mark.parametrize( ('compression', 'buffer_size', 'compressor', 'decompressor'), [ - (None, None, lambda s: s, lambda s: s), - (None, 64, lambda s: s, lambda s: s), + (None, None, identity, identity), + (None, 64, identity, identity), ('gzip', None, gzip_compress, gzip_decompress), ('gzip', 256, gzip_compress, gzip_decompress), ] ) -def test_append_stream(fs, tempdir, testpath, compression, buffer_size, - compressor, decompressor): +def test_open_append_stream(fs, tempdir, testpath, compression, buffer_size, + compressor, decompressor): file = testpath('abc') file_ = tempdir / 'abc' file_.write_bytes(compressor(b'already existing')) - with fs.append_stream(file, compression, buffer_size) as f: + with fs.open_append_stream(file, compression, buffer_size) as f: f.write(b'\nnewly added') assert decompressor(file_.read_bytes()) == b'already existing\nnewly added' From cc257210202dd89c03b1674a7cd8d8a5d8c49d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Sep 2019 15:22:10 +0200 Subject: [PATCH 06/36] inlining --- cpp/src/arrow/python/datetime.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 6fc798c0ec3..3fb0a7db0fe 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -170,7 +170,7 @@ int64_t PyTime_to_us(PyObject* pytime) { // total = next * quotient + remaining. Handles negative values by propagating // them: If total is negative, next will be negative and remaining will // always be non-negative. -static int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { +static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { int64_t r = total % quotient; if (r < 0) { *next = total / quotient - 1; @@ -181,7 +181,7 @@ static int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { } } -static Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, +static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, int64_t* hour, int64_t* minute, int64_t* second, int64_t* microsecond) { switch (unit) { @@ -209,7 +209,7 @@ static Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } -static Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, +static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, int64_t* month, int64_t* day) { switch (unit) { case DateUnit::MILLI: From e812dca035dfe3f456ccdca1a3bdf7f7476464a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Sep 2019 16:13:55 +0200 Subject: [PATCH 07/36] docstrings --- python/pyarrow/_fs.pyx | 217 +++++++++++++++++++++++++++++++- python/pyarrow/io.pxi | 5 +- python/pyarrow/tests/test_fs.py | 6 +- 3 files changed, 223 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index b6cb4bcaf49..d82caafea18 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -50,6 +50,7 @@ cpdef enum FileType: cdef class FileStats: + """FileSystem entry stats""" cdef CFileStats stats @@ -64,6 +65,20 @@ cdef class FileStats: @property def type(self): + """Type of the file + + The returned enum variants have the following meanings: + - FileType.NonExistent: target does not exist + - FileType.Uknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character + device, or Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ cdef CFileType ctype = self.stats.type() if ctype == CFileType_NonExistent: return FileType.NonExistent @@ -78,29 +93,66 @@ cdef class FileStats: @property def path(self): + """The full file path in the filesystem.""" return pathlib.Path(frombytes(self.stats.path())) @property def base_name(self): + """The file base name + + Component after the last directory separator. + """ return frombytes(self.stats.base_name()) @property def size(self): + """The size in bytes, if available + + Only regular files are guaranteed to have a size. + """ + if self.stats.type() != CFileType_File: + raise ValueError( + 'Only regular files are guaranteed to have a size' + ) return self.stats.size() @property def extension(self): + """The file extension""" return frombytes(self.stats.extension()) @property def mtime(self): + """The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime + """ cdef PyObject *out check_status(PyDateTime_from_TimePoint(self.stats.mtime(), &out)) return PyObject_to_object(out) cdef class Selector: - + """File and directory selector. + + It containt a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : Union[str, pathlib.Path] + The directory in which to select files. + If the path exists but doesn't point to a directory, this should be an + error. + allow_non_existent : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. + """ cdef CSelector selector def __init__(self, base_dir='', bint allow_non_existent=False, @@ -135,6 +187,7 @@ cdef class Selector: cdef class FileSystem: + """Abstract file system API""" cdef: shared_ptr[CFileSystem] wrapped @@ -148,6 +201,24 @@ cdef class FileSystem: self.fs = wrapped.get() def get_target_stats(self, paths_or_selector): + """Get statistics for the given target. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns an Ok status and has a FileType of value + NonExistent. An error status indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector: Union[Selector, List[Union[str, pathlib.Path]]] + Either a Selector object or a list of path-like objects. + The selector's base directory will not be part of the results, even + if it exists. If it doesn't exist, use `allow_non_existent`. + + Returns + ------- + file_stats : List[FileStats] + """ cdef: vector[CFileStats] stats vector[c_string] paths @@ -165,18 +236,71 @@ cdef class FileSystem: return [FileStats.wrap(stat) for stat in stats] def create_dir(self, path, bint recursive=True): - check_status(self.fs.CreateDir(_path_to_bytes(path), recursive=recursive)) + """Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The path of the new directory. + recursive: bool, default True + Create nested directories as well. + """ + check_status( + self.fs.CreateDir(_path_to_bytes(path), recursive=recursive) + ) def delete_dir(self, path): + """Delete a directory and its contents, recursively. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The path of the directory to be deleted. + """ check_status(self.fs.DeleteDir(_path_to_bytes(path))) def move(self, src, dest): + """Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : Union[str, pathlib.Path] + The path of the file or the directory to be moved. + dest : Union[str, pathlib.Path] + The destination path where the file or directory is moved to. + """ check_status(self.fs.Move(_path_to_bytes(src), _path_to_bytes(dest))) def copy_file(self, src, dest): + """Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : Union[str, pathlib.Path] + The path of the file to be copied from. + dest : Union[str, pathlib.Path] + The destination path where the file is copied to. + """ check_status(self.fs.CopyFile(_path_to_bytes(src), _path_to_bytes(dest))) def delete_file(self, path): + """Delete a file. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The path of the file to be deleted. + """ check_status(self.fs.DeleteFile(_path_to_bytes(path))) def _wrap_input_stream(self, stream, path, compression, buffer_size): @@ -198,6 +322,17 @@ cdef class FileSystem: return stream def open_input_file(self, path): + """Open an input file for random access reading. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The source to open for reading. + + Returns + ------- + stram : NativeFile + """ cdef: c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() @@ -211,6 +346,26 @@ cdef class FileSystem: return stream def open_input_stream(self, path, compression='detect', buffer_size=None): + """Open an input stream for sequential reading. + + Parameters + ---------- + source: Union[str, pathlib.Path] + The source to open for reading. + compression: Optional[str], default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size: Optional[int], default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. + + Returns + ------- + stream : NativeFile + """ cdef: c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() @@ -227,6 +382,28 @@ cdef class FileSystem: ) def open_output_stream(self, path, compression='detect', buffer_size=None): + """Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The source to open for writing. + compression: Optional[str], default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size: Optional[int], default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Returns + ------- + stream : NativeFile + """ cdef: c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() @@ -243,6 +420,28 @@ cdef class FileSystem: ) def open_append_stream(self, path, compression='detect', buffer_size=None): + """Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + Parameters + ---------- + path : Union[str, pathlib.Path] + The source to open for writing. + compression: Optional[str], default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size: Optional[int], default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Returns + ------- + stream : NativeFile + """ cdef: c_string pathstr = _path_to_bytes(path) NativeFile stream = NativeFile() @@ -260,6 +459,11 @@ cdef class FileSystem: cdef class LocalFileSystem(FileSystem): + """A FileSystem implementation accessing files on the local machine. + + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + """ cdef: CLocalFileSystem* localfs @@ -275,6 +479,15 @@ cdef class LocalFileSystem(FileSystem): cdef class SubTreeFileSystem(FileSystem): + """Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. + """ cdef: CSubTreeFileSystem* subtreefs diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 3f5f2ff3411..125fc63585a 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1610,7 +1610,7 @@ def input_stream(source, compression='detect', buffer_size=None): ---------- source: str, Path, buffer, file-like object, ... The source to open for reading - compression: str or None + compression: Optional[str], default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. @@ -1642,6 +1642,7 @@ def input_stream(source, compression='detect', buffer_size=None): .format(source.__class__)) if compression == 'detect': + # detect for OSFile too compression = _detect_compression(source_path) if buffer_size is not None and buffer_size != 0: @@ -1661,7 +1662,7 @@ def output_stream(source, compression='detect', buffer_size=None): ---------- source: str, Path, buffer, file-like object, ... The source to open for writing - compression: str or None + compression: Optional[str], default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index c79fc02a73d..3da1e62337b 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -64,7 +64,7 @@ def test_get_target_stats(fs, tempdir, testpath): aaa_.mkdir(parents=True) bb_.touch() - c_.touch() + c_.write_text('test') def ceiled_mtime_range(path): # arrow's filesystem implementation ceils mtime whereas pathlib rounds @@ -80,17 +80,21 @@ def ceiled_mtime_range(path): assert aaa_stat.extension == '' assert aaa_stat.type == FileType.Directory assert aaa_stat.mtime in ceiled_mtime_range(aaa_) + with pytest.raises(ValueError): + aaa_stat.size assert bb_stat.path == pathlib.Path(bb) assert bb_stat.base_name == 'bb' assert bb_stat.extension == '' assert bb_stat.type == FileType.File + assert bb_stat.size == 0 assert bb_stat.mtime in ceiled_mtime_range(bb_) assert c_stat.path == pathlib.Path(c) assert c_stat.base_name == 'c.txt' assert c_stat.extension == 'txt' assert c_stat.type == FileType.File + assert c_stat.size == 4 assert c_stat.mtime in ceiled_mtime_range(c_) selector = Selector(testpath(''), allow_non_existent=False, recursive=True) From 74598ebea601e365900756a18ee1bd0c99af1ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Sep 2019 16:57:05 +0200 Subject: [PATCH 08/36] python2 --- cpp/src/arrow/python/datetime.cc | 1 + python/pyarrow/tests/test_fs.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 3fb0a7db0fe..5e8a3d69bcf 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -231,6 +231,7 @@ Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { } Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { + PyDateTime_IMPORT; int64_t year = 0, month = 0, day = 0; RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); *out = PyDate_FromDate(static_cast(year), static_cast(month), diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 3da1e62337b..5310f5233e5 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -64,7 +64,7 @@ def test_get_target_stats(fs, tempdir, testpath): aaa_.mkdir(parents=True) bb_.touch() - c_.write_text('test') + c_.write_bytes('test') def ceiled_mtime_range(path): # arrow's filesystem implementation ceils mtime whereas pathlib rounds From b73300c9e23df80c26ef9c807d858921efd4ad5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Sep 2019 17:04:08 +0200 Subject: [PATCH 09/36] write bytes in the test case --- python/pyarrow/tests/test_fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 5310f5233e5..073f25d047c 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -64,7 +64,7 @@ def test_get_target_stats(fs, tempdir, testpath): aaa_.mkdir(parents=True) bb_.touch() - c_.write_bytes('test') + c_.write_bytes(b'test') def ceiled_mtime_range(path): # arrow's filesystem implementation ceils mtime whereas pathlib rounds From 545a2c6ee70213674a6582fe0831394eb831c91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Sep 2019 12:05:36 +0200 Subject: [PATCH 10/36] move to internal namespace --- cpp/src/arrow/python/arrow_to_pandas.cc | 4 ++-- cpp/src/arrow/python/datetime.cc | 2 ++ cpp/src/arrow/python/datetime.h | 2 ++ cpp/src/arrow/python/deserialize.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 20 ++++++++++---------- cpp/src/arrow/python/serialize.cc | 2 +- python/pyarrow/includes/libarrow.pxd | 15 +++++++++------ 7 files changed, 27 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index ceddfd5d329..539d82eae8e 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -654,7 +654,7 @@ static Status ConvertDates(const PandasOptions& options, const ChunkedArray& dat PyDateTime_IMPORT; } auto WrapValue = [](typename Type::c_type value, PyObject** out) { - RETURN_NOT_OK(PyDate_from_int(value, Type::UNIT, out)); + RETURN_NOT_OK(internal::PyDate_from_int(value, Type::UNIT, out)); RETURN_IF_PYERROR(); return Status::OK(); }; @@ -672,7 +672,7 @@ static Status ConvertTimes(const PandasOptions& options, const ChunkedArray& dat const TimeUnit::type unit = checked_cast(*data.type()).unit(); auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { - RETURN_NOT_OK(PyTime_from_int(value, unit, out)); + RETURN_NOT_OK(internal::PyTime_from_int(value, unit, out)); RETURN_IF_PYERROR(); return Status::OK(); }; diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 5e8a3d69bcf..b2bb78c548d 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -28,6 +28,7 @@ namespace arrow { namespace py { +namespace internal { // The following code is adapted from // https://github.com/numpy/numpy/blob/master/numpy/core/src/multiarray/datetime.c @@ -282,5 +283,6 @@ int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { (PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL); } +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index e00c44642d2..3ffc174dc6f 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -28,6 +28,7 @@ namespace arrow { namespace py { +namespace internal { int64_t PyTime_to_us(PyObject* pytime); @@ -70,6 +71,7 @@ inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { return PyDateTime_to_us(pydatetime) * 1000; } +} // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index 7cad8c9682e..fb07442a8d9 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -154,7 +154,7 @@ Status GetValue(PyObject* context, const Array& arr, int64_t index, int8_t type, *result = PyFloat_FromDouble(checked_cast(arr).Value(index)); return Status::OK(); case PythonType::DATE64: { - RETURN_NOT_OK(PyDateTime_from_int( + RETURN_NOT_OK(internal::PyDateTime_from_int( checked_cast(arr).Value(index), TimeUnit::MICRO, result)); RETURN_IF_PYERROR(); return Status::OK(); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index f7b7b169c7f..f48d32bf402 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -301,7 +301,7 @@ class Date32Converter int32_t t; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); - t = static_cast(PyDate_to_days(pydate)); + t = static_cast(internal::PyDate_to_days(pydate)); } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32")); } @@ -322,7 +322,7 @@ class Date64Converter t -= t % 86400000LL; } else if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); - t = PyDate_to_ms(pydate); + t = internal::PyDate_to_ms(pydate); } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date64")); } @@ -343,10 +343,10 @@ class Time32Converter // datetime.time stores microsecond resolution switch (unit_) { case TimeUnit::SECOND: - t = static_cast(PyTime_to_s(obj)); + t = static_cast(internal::PyTime_to_s(obj)); break; case TimeUnit::MILLI: - t = static_cast(PyTime_to_ms(obj)); + t = static_cast(internal::PyTime_to_ms(obj)); break; default: return Status::UnknownError("Invalid time unit"); @@ -373,10 +373,10 @@ class Time64Converter // datetime.time stores microsecond resolution switch (unit_) { case TimeUnit::MICRO: - t = PyTime_to_us(obj); + t = internal::PyTime_to_us(obj); break; case TimeUnit::NANO: - t = PyTime_to_ns(obj); + t = internal::PyTime_to_ns(obj); break; default: return Status::UnknownError("Invalid time unit"); @@ -404,16 +404,16 @@ class TimestampConverter switch (unit_) { case TimeUnit::SECOND: - t = PyDateTime_to_s(pydatetime); + t = internal::PyDateTime_to_s(pydatetime); break; case TimeUnit::MILLI: - t = PyDateTime_to_ms(pydatetime); + t = internal::PyDateTime_to_ms(pydatetime); break; case TimeUnit::MICRO: - t = PyDateTime_to_us(pydatetime); + t = internal::PyDateTime_to_us(pydatetime); break; case TimeUnit::NANO: - t = PyDateTime_to_ns(pydatetime); + t = internal::PyDateTime_to_ns(pydatetime); break; default: return Status::UnknownError("Invalid time unit"); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index a5a7dada584..a344e90ed72 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -466,7 +466,7 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, RETURN_NOT_OK(builder->AppendNone()); } else if (PyDateTime_Check(elem)) { PyDateTime_DateTime* datetime = reinterpret_cast(elem); - RETURN_NOT_OK(builder->AppendDate64(PyDateTime_to_us(datetime))); + RETURN_NOT_OK(builder->AppendDate64(internal::PyDateTime_to_us(datetime))); } else if (is_buffer(elem)) { RETURN_NOT_OK(builder->AppendBuffer(static_cast(blobs_out->buffers.size()))); std::shared_ptr buffer; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5d632217de8..e3a9d199b36 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1281,12 +1281,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: object PyHalf_FromHalf(npy_half value) - cdef cppclass CTimePoint "arrow::py::TimePoint": - pass - - cdef CStatus PyDateTime_from_int(int64_t val, const TimeUnit unit, PyObject** out) - cdef CStatus PyDateTime_from_TimePoint(CTimePoint val, PyObject** out) - cdef cppclass PyConversionOptions: PyConversionOptions() @@ -1414,6 +1408,15 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CSerializedPyObject* out) +cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: + + cdef cppclass CTimePoint "arrow::py::internal::TimePoint": + pass + + cdef CStatus PyDateTime_from_int(int64_t val, const TimeUnit unit, PyObject** out) + cdef CStatus PyDateTime_from_TimePoint(CTimePoint val, PyObject** out) + + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 From e24a036f99859a2dbf4c5f051191cc764d0cbb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Sep 2019 12:18:54 +0200 Subject: [PATCH 11/36] raise on invalid path --- python/pyarrow/_fs.pyx | 14 +++++++++++--- python/pyarrow/tests/test_fs.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index d82caafea18..2aebdec2f6e 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -22,6 +22,8 @@ try: except ImportError: import pathlib2 as pathlib # py2 compat +import six + from pyarrow.compat import frombytes, tobytes from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport PyDateTime_from_TimePoint @@ -37,9 +39,15 @@ from pyarrow.lib cimport ( ) -cdef inline c_string _path_to_bytes(p): - # supports types: byte, str, pathlib.Path - return tobytes(str(p)) +cdef inline c_string _path_to_bytes(path) except *: + if isinstance(path, six.binary_type): + return path + elif isinstance(path, six.string_types): + return tobytes(path) + elif isinstance(path, pathlib.Path): + return tobytes(str(path)) + else: + raise TypeError('Path must be a string or an instance of pathlib.Path') cpdef enum FileType: diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 073f25d047c..8c6a9a09909 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -57,6 +57,16 @@ def testpath(request, fs, tempdir): return lambda path: request.param(tempdir / path) +def test_non_path_like_input_raises(fs): + class Path: + pass + + invalid_paths = [1, 1.1, Path(), tuple(), {}, [], lambda: 1] + for path in invalid_paths: + with pytest.raises(TypeError): + fs.create_dir(path) + + def test_get_target_stats(fs, tempdir, testpath): aaa, aaa_ = testpath('a/aa/aaa'), tempdir / 'a' / 'aa' / 'aaa' bb, bb_ = testpath('a/bb'), tempdir / 'a' / 'bb' From cd8b3872fc87dc3d80daaaed1d3f977b7f4ca153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 5 Sep 2019 15:16:22 +0200 Subject: [PATCH 12/36] ARROW_PYTHON_EXPORT --- cpp/src/arrow/python/datetime.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 3ffc174dc6f..a95f47e3058 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -22,6 +22,7 @@ #include #include "arrow/python/platform.h" +#include "arrow/python/visibility.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" @@ -30,43 +31,59 @@ namespace arrow { namespace py { namespace internal { +ARROW_PYTHON_EXPORT int64_t PyTime_to_us(PyObject* pytime); +ARROW_PYTHON_EXPORT inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; } +ARROW_PYTHON_EXPORT inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; } +ARROW_PYTHON_EXPORT inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; } +ARROW_PYTHON_EXPORT Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); + +ARROW_PYTHON_EXPORT Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out); + +ARROW_PYTHON_EXPORT Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); -using TimePoint = - std::chrono::time_point; +using TimePoint = ARROW_PYTHON_EXPORT + std::chrono::time_point; +ARROW_PYTHON_EXPORT Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out); +ARROW_PYTHON_EXPORT int64_t PyDate_to_days(PyDateTime_Date* pydate); +ARROW_PYTHON_EXPORT inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { return PyDate_to_days(pydate) * 24 * 3600 * 1000; } +ARROW_PYTHON_EXPORT int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime); +ARROW_PYTHON_EXPORT inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { int64_t date_ms = PyDateTime_to_s(pydatetime) * 1000; int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000; return date_ms + ms; } +ARROW_PYTHON_EXPORT inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { int64_t ms = PyDateTime_to_s(pydatetime) * 1000; int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); return ms * 1000 + us; } +ARROW_PYTHON_EXPORT inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { return PyDateTime_to_us(pydatetime) * 1000; } From 8ba3c6078ed715f96cfd576e81536a71917f48c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 6 Sep 2019 09:29:36 +0200 Subject: [PATCH 13/36] don't export --- cpp/src/arrow/python/datetime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a95f47e3058..a5f658458ce 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -52,7 +52,7 @@ Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out); ARROW_PYTHON_EXPORT Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); -using TimePoint = ARROW_PYTHON_EXPORT +using TimePoint = std::chrono::time_point; ARROW_PYTHON_EXPORT From 299c7028bda2c60e61445b7637c7f8f8ea5a0d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 9 Sep 2019 12:03:08 +0200 Subject: [PATCH 14/36] use internal namespace --- cpp/src/arrow/python/python_to_arrow.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index f48d32bf402..2b3c9ffebf5 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -317,7 +317,7 @@ class Date64Converter int64_t t; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); - t = PyDateTime_to_ms(pydate); + t = internal::PyDateTime_to_ms(pydate); // Truncate any intraday milliseconds t -= t % 86400000LL; } else if (PyDate_Check(obj)) { From 76993d2b4ce27ebee03acf3274a0b262c05a0a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 14:25:21 +0200 Subject: [PATCH 15/36] review comments --- python/pyarrow/_fs.pyx | 88 +++++++++++++++++++-------------- python/pyarrow/tests/test_fs.py | 2 +- 2 files changed, 52 insertions(+), 38 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 2aebdec2f6e..f6fdf0a7f33 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -45,14 +45,14 @@ cdef inline c_string _path_to_bytes(path) except *: elif isinstance(path, six.string_types): return tobytes(path) elif isinstance(path, pathlib.Path): - return tobytes(str(path)) + return tobytes(path.as_posix()) else: raise TypeError('Path must be a string or an instance of pathlib.Path') cpdef enum FileType: NonExistent - Uknown + Unknown File Directory @@ -77,9 +77,9 @@ cdef class FileStats: The returned enum variants have the following meanings: - FileType.NonExistent: target does not exist - - FileType.Uknown: target exists but its type is unknown (could be a - special file such as a Unix socket or character - device, or Windows NUL / CON / ...) + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character + device, or Windows NUL / CON / ...) - FileType.File: target is a regular file - FileType.Directory: target is a regular directory @@ -91,7 +91,7 @@ cdef class FileStats: if ctype == CFileType_NonExistent: return FileType.NonExistent elif ctype == CFileType_Unknown: - return FileType.Uknown + return FileType.Unknown elif ctype == CFileType_File: return FileType.File elif ctype == CFileType_Directory: @@ -150,7 +150,7 @@ cdef class Selector: Parameters ---------- - base_dir : Union[str, pathlib.Path] + base_dir : str or pathlib.Path The directory in which to select files. If the path exists but doesn't point to a directory, this should be an error. @@ -212,20 +212,20 @@ cdef class FileSystem: """Get statistics for the given target. Any symlink is automatically dereferenced, recursively. A non-existing - or unreachable file returns an Ok status and has a FileType of value - NonExistent. An error status indicates a truly exceptional condition + or unreachable file returns a FileStats object and has a FileType of + value NonExistent. An exception indicates a truly exceptional condition (low-level I/O error, etc.). Parameters ---------- - paths_or_selector: Union[Selector, List[Union[str, pathlib.Path]]] + paths_or_selector: Selector or list of path-likes Either a Selector object or a list of path-like objects. The selector's base directory will not be part of the results, even if it exists. If it doesn't exist, use `allow_non_existent`. Returns ------- - file_stats : List[FileStats] + file_stats : list of FileStats """ cdef: vector[CFileStats] stats @@ -233,11 +233,13 @@ cdef class FileSystem: CSelector selector if isinstance(paths_or_selector, Selector): - selector = (paths_or_selector).selector - check_status(self.fs.GetTargetStats(selector, &stats)) + with nogil: + selector = (paths_or_selector).selector + check_status(self.fs.GetTargetStats(selector, &stats)) elif isinstance(paths_or_selector, (list, tuple)): paths = [_path_to_bytes(s) for s in paths_or_selector] - check_status(self.fs.GetTargetStats(paths, &stats)) + with nogil: + check_status(self.fs.GetTargetStats(paths, &stats)) else: raise TypeError('Must pass either paths or a Selector') @@ -250,24 +252,26 @@ cdef class FileSystem: Parameters ---------- - path : Union[str, pathlib.Path] + path : str or pathlib.Path The path of the new directory. recursive: bool, default True Create nested directories as well. """ - check_status( - self.fs.CreateDir(_path_to_bytes(path), recursive=recursive) - ) + cdef c_string directory = _path_to_bytes(path) + with nogil: + check_status(self.fs.CreateDir(directory, recursive=recursive)) def delete_dir(self, path): """Delete a directory and its contents, recursively. Parameters ---------- - path : Union[str, pathlib.Path] + path : str or pathlib.Path The path of the directory to be deleted. """ - check_status(self.fs.DeleteDir(_path_to_bytes(path))) + cdef c_string directory = _path_to_bytes(path) + with nogil: + check_status(self.fs.DeleteDir(directory)) def move(self, src, dest): """Move / rename a file or directory. @@ -279,12 +283,16 @@ cdef class FileSystem: Parameters ---------- - src : Union[str, pathlib.Path] + src : str or pathlib.Path The path of the file or the directory to be moved. - dest : Union[str, pathlib.Path] + dest : str or pathlib.Path The destination path where the file or directory is moved to. """ - check_status(self.fs.Move(_path_to_bytes(src), _path_to_bytes(dest))) + cdef: + c_string source = _path_to_bytes(src) + c_string destination = _path_to_bytes(dest) + with nogil: + check_status(self.fs.Move(source, destination)) def copy_file(self, src, dest): """Copy a file. @@ -294,22 +302,28 @@ cdef class FileSystem: Parameters ---------- - src : Union[str, pathlib.Path] + src : str or pathlib.Path The path of the file to be copied from. - dest : Union[str, pathlib.Path] + dest : str or pathlib.Path The destination path where the file is copied to. """ - check_status(self.fs.CopyFile(_path_to_bytes(src), _path_to_bytes(dest))) + cdef: + c_string source = _path_to_bytes(src) + c_string destination = _path_to_bytes(dest) + with nogil: + check_status(self.fs.CopyFile(source, destination)) def delete_file(self, path): """Delete a file. Parameters ---------- - path : Union[str, pathlib.Path] + path : str or pathlib.Path The path of the file to be deleted. """ - check_status(self.fs.DeleteFile(_path_to_bytes(path))) + cdef c_string file = _path_to_bytes(path) + with nogil: + check_status(self.fs.DeleteFile(file)) def _wrap_input_stream(self, stream, path, compression, buffer_size): if buffer_size is not None and buffer_size != 0: @@ -358,15 +372,15 @@ cdef class FileSystem: Parameters ---------- - source: Union[str, pathlib.Path] + source: str or pathlib.Path The source to open for reading. - compression: Optional[str], default 'detect' + compression: str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: Optional[int], default None + buffer_size: int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary read buffer. @@ -396,15 +410,15 @@ cdef class FileSystem: Parameters ---------- - path : Union[str, pathlib.Path] + path : str or pathlib.Path The source to open for writing. - compression: Optional[str], default 'detect' + compression: str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: Optional[int], default None + buffer_size: int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. @@ -434,15 +448,15 @@ cdef class FileSystem: Parameters ---------- - path : Union[str, pathlib.Path] + path : str or pathlib.Path The source to open for writing. - compression: Optional[str], default 'detect' + compression: str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: Optional[int], default None + buffer_size: int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 8c6a9a09909..f8ac90fed38 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -47,7 +47,7 @@ def fs(request, tempdir): pytest.param(str, id='str') ]) def testpath(request, fs, tempdir): - # we always use the tempdir for rewding and writing test artifacts, but + # we always use the tempdir for reading and writing test artifacts, but # if the filesystem is wrapped in a SubTreeFileSystem then we don't need # to prepend the path with the tempdir, we also test the API with both # pathlib.Path objects and plain python strings From 43c12bc94919e76aca2818f104bcf7264c41845d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 14:47:28 +0200 Subject: [PATCH 16/36] review comments --- cpp/src/arrow/python/datetime.cc | 20 -------------------- cpp/src/arrow/python/datetime.h | 17 +++++++++++++++-- python/pyarrow/_fs.pyx | 6 ++++++ python/pyarrow/tests/test_fs.py | 18 +++++++++--------- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index b2bb78c548d..4d4fdab8395 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -157,13 +157,6 @@ static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_m return; } -int64_t PyTime_to_us(PyObject* pytime) { - return (static_cast(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL + - static_cast(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL + - static_cast(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL + - PyDateTime_TIME_GET_MICROSECOND(pytime)); -} - // Splitting time quantities, for example splitting total seconds into // minutes and remaining seconds. After we run // int64_t remaining = split_time(total, quotient, &next) @@ -261,9 +254,6 @@ using TimePoint = Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out) { auto nanos = val.time_since_epoch(); auto micros = std::chrono::duration_cast(nanos); - if (micros < nanos) { - micros += std::chrono::microseconds(1); // ceil - } RETURN_NOT_OK(PyDateTime_from_int(micros.count(), TimeUnit::MICRO, out)); return Status::OK(); } @@ -273,16 +263,6 @@ int64_t PyDate_to_days(PyDateTime_Date* pydate) { PyDateTime_GET_DAY(pydate)); } -int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { - int64_t total_seconds = 0; - total_seconds += PyDateTime_DATE_GET_SECOND(pydatetime); - total_seconds += PyDateTime_DATE_GET_MINUTE(pydatetime) * 60; - total_seconds += PyDateTime_DATE_GET_HOUR(pydatetime) * 3600; - - return total_seconds + - (PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL); -} - } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index a5f658458ce..7d8644606f2 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -32,7 +32,12 @@ namespace py { namespace internal { ARROW_PYTHON_EXPORT -int64_t PyTime_to_us(PyObject* pytime); +inline int64_t PyTime_to_us(PyObject* pytime) { + return (static_cast(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL + + static_cast(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL + + static_cast(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL + + PyDateTime_TIME_GET_MICROSECOND(pytime)); +} ARROW_PYTHON_EXPORT inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; } @@ -67,7 +72,15 @@ inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { } ARROW_PYTHON_EXPORT -int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime); +inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { + int64_t total_seconds = 0; + total_seconds += PyDateTime_DATE_GET_SECOND(pydatetime); + total_seconds += PyDateTime_DATE_GET_MINUTE(pydatetime) * 60; + total_seconds += PyDateTime_DATE_GET_HOUR(pydatetime) * 3600; + + return total_seconds + + (PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL); +} ARROW_PYTHON_EXPORT inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index f6fdf0a7f33..8296f85d013 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -40,6 +40,12 @@ from pyarrow.lib cimport ( cdef inline c_string _path_to_bytes(path) except *: + """Converts path-like objects to bytestring + + tobytes always uses utf-8, which is more or less ok, at least on Windows + since the C++ side then decodes from utf-8. On Unix, os.fsencode may be + better. + """ if isinstance(path, six.binary_type): return path elif isinstance(path, six.string_types): diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index f8ac90fed38..4cf994a6079 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -76,12 +76,12 @@ def test_get_target_stats(fs, tempdir, testpath): bb_.touch() c_.write_bytes(b'test') - def ceiled_mtime_range(path): - # arrow's filesystem implementation ceils mtime whereas pathlib rounds - return { - datetime.utcfromtimestamp(path.stat().st_mtime), - datetime.utcfromtimestamp(path.stat().st_mtime + 10**-6) - } + def mtime_almost_equal(fs_dt, pathlib_ts): + # arrow's filesystem implementation truncates mtime to microsends + # resolution whereas pathlib rounds + pathlib_dt = datetime.utcfromtimestamp(pathlib_ts) + difference = (fs_dt - pathlib_dt).total_seconds() + return abs(difference) <= 10**-6 aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) @@ -89,7 +89,7 @@ def ceiled_mtime_range(path): assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' assert aaa_stat.type == FileType.Directory - assert aaa_stat.mtime in ceiled_mtime_range(aaa_) + assert mtime_almost_equal(aaa_stat.mtime, aaa_.stat().st_mtime) with pytest.raises(ValueError): aaa_stat.size @@ -98,14 +98,14 @@ def ceiled_mtime_range(path): assert bb_stat.extension == '' assert bb_stat.type == FileType.File assert bb_stat.size == 0 - assert bb_stat.mtime in ceiled_mtime_range(bb_) + assert mtime_almost_equal(bb_stat.mtime, bb_.stat().st_mtime) assert c_stat.path == pathlib.Path(c) assert c_stat.base_name == 'c.txt' assert c_stat.extension == 'txt' assert c_stat.type == FileType.File assert c_stat.size == 4 - assert c_stat.mtime in ceiled_mtime_range(c_) + assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) selector = Selector(testpath(''), allow_non_existent=False, recursive=True) nodes = fs.get_target_stats(selector) From 962c34627e7966f59f96e06bceea3450467bf6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 16:42:43 +0200 Subject: [PATCH 17/36] revert FindClangTools --- cpp/cmake_modules/FindClangTools.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index be251e7b6c4..a6ecbf78437 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -75,7 +75,7 @@ function(FIND_CLANG_TOOL NAME OUTPUT VERSION_CHECK_PATTERN) endfunction() find_clang_tool(clang-tidy CLANG_TIDY_BIN - "LLVM version ${ARROW_LLVM_MAJOR_VERSION}") + "LLVM version ${ARROW_LLVM_MAJOR_VERSION}\\.${ARROW_LLVM_MINOR_VERSION}") if(CLANG_TIDY_BIN) set(CLANG_TIDY_FOUND 1) message(STATUS "clang-tidy found at ${CLANG_TIDY_BIN}") @@ -86,7 +86,7 @@ endif() find_clang_tool( clang-format CLANG_FORMAT_BIN - "^clang-format version ${ARROW_LLVM_MAJOR_VERSION}") + "^clang-format version ${ARROW_LLVM_MAJOR_VERSION}\\.${ARROW_LLVM_MINOR_VERSION}") if(CLANG_FORMAT_BIN) set(CLANG_FORMAT_FOUND 1) message(STATUS "clang-format found at ${CLANG_FORMAT_BIN}") From 0baf5ee4b9bb2e23fb29589fa2b4c5faaf80bea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 17:00:06 +0200 Subject: [PATCH 18/36] convert string paths to posix too; don't accept bytes paths --- python/pyarrow/_fs.pyx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 8296f85d013..94739d9741b 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -46,14 +46,11 @@ cdef inline c_string _path_to_bytes(path) except *: since the C++ side then decodes from utf-8. On Unix, os.fsencode may be better. """ - if isinstance(path, six.binary_type): - return path - elif isinstance(path, six.string_types): - return tobytes(path) - elif isinstance(path, pathlib.Path): - return tobytes(path.as_posix()) - else: + if isinstance(path, six.string_types): + path = pathlib.Path(path) + elif not isinstance(path, pathlib.Path): raise TypeError('Path must be a string or an instance of pathlib.Path') + return tobytes(path.as_posix()) cpdef enum FileType: From 88409ed110b3928d99b2d3107f54ebbfec27110d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 17:01:29 +0200 Subject: [PATCH 19/36] remove comment --- cpp/src/arrow/filesystem/filesystem.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index fefe92f05a8..a00fa1269ed 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -115,7 +115,6 @@ struct ARROW_EXPORT FileStats { }; /// \brief EXPERIMENTAL: file selector -// call it FileSelector? struct ARROW_EXPORT Selector { // The directory in which to select files. // If the path exists but doesn't point to a directory, this should be an error. From d898b064c5e10a3ba5be704f3df6fca38fc054fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 10 Sep 2019 17:02:28 +0200 Subject: [PATCH 20/36] io.pxi docstring --- python/pyarrow/io.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 125fc63585a..7f341727e62 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1610,7 +1610,7 @@ def input_stream(source, compression='detect', buffer_size=None): ---------- source: str, Path, buffer, file-like object, ... The source to open for reading - compression: Optional[str], default 'detect' + compression: str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. @@ -1662,7 +1662,7 @@ def output_stream(source, compression='detect', buffer_size=None): ---------- source: str, Path, buffer, file-like object, ... The source to open for writing - compression: Optional[str], default 'detect' + compression: str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. From abc51ab2321cc4ba1caff80f4c6d8583962c9303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 11 Sep 2019 12:49:07 +0200 Subject: [PATCH 21/36] clang-format; use posix-style path as basepath for SubTreeFileSystem --- cpp/src/arrow/python/datetime.cc | 5 ++--- cpp/src/arrow/python/datetime.h | 2 +- python/pyarrow/_fs.pyx | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 4d4fdab8395..5a2d1e09ef2 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -15,17 +15,16 @@ // specific language governing permissions and limitations // under the License. -#include #include +#include #include -#include "arrow/python/platform.h" #include "arrow/python/datetime.h" +#include "arrow/python/platform.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" - namespace arrow { namespace py { namespace internal { diff --git a/cpp/src/arrow/python/datetime.h b/cpp/src/arrow/python/datetime.h index 7d8644606f2..7af4f7a5ad7 100644 --- a/cpp/src/arrow/python/datetime.h +++ b/cpp/src/arrow/python/datetime.h @@ -58,7 +58,7 @@ ARROW_PYTHON_EXPORT Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out); using TimePoint = - std::chrono::time_point; + std::chrono::time_point; ARROW_PYTHON_EXPORT Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out); diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 94739d9741b..193b9ff8b28 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -522,7 +522,7 @@ cdef class SubTreeFileSystem(FileSystem): c_string pathstr shared_ptr[CSubTreeFileSystem] wrapped - pathstr = tobytes(str(base_path)) + pathstr = _path_to_bytes(base_path) wrapped = make_shared[CSubTreeFileSystem](pathstr, base_fs.wrapped) self.init( wrapped) From 0939d970694ba72fbcf3a3ab97e055d9d2b45c8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 11 Sep 2019 15:57:37 +0200 Subject: [PATCH 22/36] int enum --- cpp/src/arrow/filesystem/filesystem.h | 2 +- python/pyarrow/_fs.pyx | 20 +++++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index a00fa1269ed..0be5adad92d 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -57,7 +57,7 @@ using TimePoint = std::chrono::time_point; /// \brief EXPERIMENTAL: FileSystem entry type -enum class ARROW_EXPORT FileType { +enum class ARROW_EXPORT FileType : int8_t { // Target does not exist NonExistent, // Target exists but its type is unknown (could be a special file such diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 193b9ff8b28..a76ebafbd50 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -54,10 +54,10 @@ cdef inline c_string _path_to_bytes(path) except *: cpdef enum FileType: - NonExistent - Unknown - File - Directory + NonExistent = CFileType_NonExistent + Unknown = CFileType_Unknown + File = CFileType_File + Directory = CFileType_Directory cdef class FileStats: @@ -90,17 +90,7 @@ cdef class FileStats: ------- type : FileType """ - cdef CFileType ctype = self.stats.type() - if ctype == CFileType_NonExistent: - return FileType.NonExistent - elif ctype == CFileType_Unknown: - return FileType.Unknown - elif ctype == CFileType_File: - return FileType.File - elif ctype == CFileType_Directory: - return FileType.Directory - else: - raise ValueError('Unhandled FileType {}'.format(type)) + return FileType( self.stats.type()) @property def path(self): From de75cb535c2652d1c4120ed863e2ac263ed5ec07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 11 Sep 2019 17:04:40 +0200 Subject: [PATCH 23/36] remove pathlib dependency --- python/pyarrow/_fs.pyx | 48 ++++++++++++++------------------- python/pyarrow/tests/test_fs.py | 6 ++--- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index a76ebafbd50..03414c469a9 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -17,17 +17,13 @@ # cython: language_level = 3 -try: - import pathlib -except ImportError: - import pathlib2 as pathlib # py2 compat - import six from pyarrow.compat import frombytes, tobytes from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport PyDateTime_from_TimePoint from pyarrow.includes.libarrow_fs cimport * +from pyarrow.util import _stringify_path from pyarrow.lib import _detect_compression from pyarrow.lib cimport ( check_status, @@ -39,18 +35,14 @@ from pyarrow.lib cimport ( ) -cdef inline c_string _path_to_bytes(path) except *: - """Converts path-like objects to bytestring +cdef inline c_string _as_posix_path(path) except *: + """Converts path-like objects to bytestring with posix separators tobytes always uses utf-8, which is more or less ok, at least on Windows since the C++ side then decodes from utf-8. On Unix, os.fsencode may be better. """ - if isinstance(path, six.string_types): - path = pathlib.Path(path) - elif not isinstance(path, pathlib.Path): - raise TypeError('Path must be a string or an instance of pathlib.Path') - return tobytes(path.as_posix()) + return tobytes(_stringify_path(path).replace('\\', '/')) cpdef enum FileType: @@ -95,7 +87,7 @@ cdef class FileStats: @property def path(self): """The full file path in the filesystem.""" - return pathlib.Path(frombytes(self.stats.path())) + return frombytes(self.stats.path()) @property def base_name(self): @@ -164,11 +156,11 @@ cdef class Selector: @property def base_dir(self): - return pathlib.Path(self.selector.base_dir) + return frombytes(self.selector.base_dir) @base_dir.setter def base_dir(self, base_dir): - self.selector.base_dir = _path_to_bytes(base_dir) + self.selector.base_dir = _as_posix_path(base_dir) @property def allow_non_existent(self): @@ -230,7 +222,7 @@ cdef class FileSystem: selector = (paths_or_selector).selector check_status(self.fs.GetTargetStats(selector, &stats)) elif isinstance(paths_or_selector, (list, tuple)): - paths = [_path_to_bytes(s) for s in paths_or_selector] + paths = [_as_posix_path(s) for s in paths_or_selector] with nogil: check_status(self.fs.GetTargetStats(paths, &stats)) else: @@ -250,7 +242,7 @@ cdef class FileSystem: recursive: bool, default True Create nested directories as well. """ - cdef c_string directory = _path_to_bytes(path) + cdef c_string directory = _as_posix_path(path) with nogil: check_status(self.fs.CreateDir(directory, recursive=recursive)) @@ -262,7 +254,7 @@ cdef class FileSystem: path : str or pathlib.Path The path of the directory to be deleted. """ - cdef c_string directory = _path_to_bytes(path) + cdef c_string directory = _as_posix_path(path) with nogil: check_status(self.fs.DeleteDir(directory)) @@ -282,8 +274,8 @@ cdef class FileSystem: The destination path where the file or directory is moved to. """ cdef: - c_string source = _path_to_bytes(src) - c_string destination = _path_to_bytes(dest) + c_string source = _as_posix_path(src) + c_string destination = _as_posix_path(dest) with nogil: check_status(self.fs.Move(source, destination)) @@ -301,8 +293,8 @@ cdef class FileSystem: The destination path where the file is copied to. """ cdef: - c_string source = _path_to_bytes(src) - c_string destination = _path_to_bytes(dest) + c_string source = _as_posix_path(src) + c_string destination = _as_posix_path(dest) with nogil: check_status(self.fs.CopyFile(source, destination)) @@ -314,7 +306,7 @@ cdef class FileSystem: path : str or pathlib.Path The path of the file to be deleted. """ - cdef c_string file = _path_to_bytes(path) + cdef c_string file = _as_posix_path(path) with nogil: check_status(self.fs.DeleteFile(file)) @@ -349,7 +341,7 @@ cdef class FileSystem: stram : NativeFile """ cdef: - c_string pathstr = _path_to_bytes(path) + c_string pathstr = _as_posix_path(path) NativeFile stream = NativeFile() shared_ptr[CRandomAccessFile] in_handle @@ -382,7 +374,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _path_to_bytes(path) + c_string pathstr = _as_posix_path(path) NativeFile stream = NativeFile() shared_ptr[CInputStream] in_handle @@ -420,7 +412,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _path_to_bytes(path) + c_string pathstr = _as_posix_path(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle @@ -458,7 +450,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _path_to_bytes(path) + c_string pathstr = _as_posix_path(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle @@ -512,7 +504,7 @@ cdef class SubTreeFileSystem(FileSystem): c_string pathstr shared_ptr[CSubTreeFileSystem] wrapped - pathstr = _path_to_bytes(base_path) + pathstr = _as_posix_path(base_path) wrapped = make_shared[CSubTreeFileSystem](pathstr, base_fs.wrapped) self.init( wrapped) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 4cf994a6079..5ead2142663 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -85,7 +85,7 @@ def mtime_almost_equal(fs_dt, pathlib_ts): aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) - assert aaa_stat.path == pathlib.Path(aaa) + assert aaa_stat.path == pathlib.Path(aaa).as_posix() assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' assert aaa_stat.type == FileType.Directory @@ -93,14 +93,14 @@ def mtime_almost_equal(fs_dt, pathlib_ts): with pytest.raises(ValueError): aaa_stat.size - assert bb_stat.path == pathlib.Path(bb) + assert bb_stat.path == pathlib.Path(bb).as_posix() assert bb_stat.base_name == 'bb' assert bb_stat.extension == '' assert bb_stat.type == FileType.File assert bb_stat.size == 0 assert mtime_almost_equal(bb_stat.mtime, bb_.stat().st_mtime) - assert c_stat.path == pathlib.Path(c) + assert c_stat.path == pathlib.Path(c).as_posix() assert c_stat.base_name == 'c.txt' assert c_stat.extension == 'txt' assert c_stat.type == FileType.File From 4c794a34fd966927145fb68d3a1d30760c33652b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 11 Sep 2019 17:35:01 +0200 Subject: [PATCH 24/36] explain Selector.base_dir; a couple of additional tests --- python/pyarrow/_fs.pyx | 13 ++++++++----- python/pyarrow/tests/test_fs.py | 13 +++++++++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 03414c469a9..1e8407ba99a 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -135,10 +135,11 @@ cdef class Selector: Parameters ---------- - base_dir : str or pathlib.Path - The directory in which to select files. - If the path exists but doesn't point to a directory, this should be an - error. + base_dir : str or pathlib.Path, default '' + The directory in which to select files. Passing an empty string refers + to the root directory of the FileSystem: + - the root directory of the LocalFileSystem + - the base path of the SubTreeFileSystem allow_non_existent : bool, default False The behavior if `base_dir` doesn't exist in the filesystem. If false, an error is returned. @@ -187,7 +188,9 @@ cdef class FileSystem: CFileSystem* fs def __init__(self): - raise TypeError('dont initialize me') + raise TypeError("FileSystem is an abstract class, initialize one of " + "the subclasses instead: LocalFileSystem or " + "SubTreeFileSystem") cdef init(self, const shared_ptr[CFileSystem]& wrapped): self.wrapped = wrapped diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 5ead2142663..340fc31ce3c 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -24,7 +24,8 @@ import pytest from pyarrow import ArrowIOError -from pyarrow.fs import FileType, LocalFileSystem, SubTreeFileSystem, Selector +from pyarrow.fs import (FileType, Selector, FileSystem, LocalFileSystem, + SubTreeFileSystem) from pyarrow.tests.test_io import gzip_compress, gzip_decompress @@ -57,6 +58,11 @@ def testpath(request, fs, tempdir): return lambda path: request.param(tempdir / path) +def test_cannot_instantiate_base_filesystem(): + with pytest.raises(TypeError): + FileSystem() + + def test_non_path_like_input_raises(fs): class Path: pass @@ -107,7 +113,10 @@ def mtime_almost_equal(fs_dt, pathlib_ts): assert c_stat.size == 4 assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) - selector = Selector(testpath(''), allow_non_existent=False, recursive=True) + base_dir = testpath('') + selector = Selector(base_dir, allow_non_existent=False, recursive=True) + assert selector.base_dir == str(base_dir) + nodes = fs.get_target_stats(selector) assert len(nodes) == 5 assert len(list(n for n in nodes if n.type == FileType.File)) == 2 From 4ae995135bb47f6c7837a9388728c045625e9924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 12 Sep 2019 10:05:07 +0200 Subject: [PATCH 25/36] remove default value from selector; test relative paths --- python/pyarrow/_fs.pyx | 12 +++++------- python/pyarrow/tests/test_fs.py | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 1e8407ba99a..beec81b9370 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -135,11 +135,9 @@ cdef class Selector: Parameters ---------- - base_dir : str or pathlib.Path, default '' - The directory in which to select files. Passing an empty string refers - to the root directory of the FileSystem: - - the root directory of the LocalFileSystem - - the base path of the SubTreeFileSystem + base_dir : str or pathlib.Path + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. allow_non_existent : bool, default False The behavior if `base_dir` doesn't exist in the filesystem. If false, an error is returned. @@ -149,7 +147,7 @@ cdef class Selector: """ cdef CSelector selector - def __init__(self, base_dir='', bint allow_non_existent=False, + def __init__(self, base_dir, bint allow_non_existent=False, bint recursive=False): self.base_dir = base_dir self.recursive = recursive @@ -188,7 +186,7 @@ cdef class FileSystem: CFileSystem* fs def __init__(self): - raise TypeError("FileSystem is an abstract class, initialize one of " + raise TypeError("FileSystem is an abstract class, instantiate one of " "the subclasses instead: LocalFileSystem or " "SubTreeFileSystem") diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 340fc31ce3c..6df35b488be 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -113,14 +113,24 @@ def mtime_almost_equal(fs_dt, pathlib_ts): assert c_stat.size == 4 assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) - base_dir = testpath('') + +@pytest.mark.parametrize('base_dir', ['.', '..']) +def test_get_target_stats_with_selector(fs, tempdir, testpath, base_dir): + base_dir = testpath(base_dir) + base_dir_ = tempdir / base_dir + selector = Selector(base_dir, allow_non_existent=False, recursive=True) - assert selector.base_dir == str(base_dir) + assert selector.base_dir == pathlib.Path(base_dir).as_posix() + + stats = fs.get_target_stats(selector) + assert len(stats) == len(list(base_dir_.iterdir())) - nodes = fs.get_target_stats(selector) - assert len(nodes) == 5 - assert len(list(n for n in nodes if n.type == FileType.File)) == 2 - assert len(list(n for n in nodes if n.type == FileType.Directory)) == 3 + for st in stats: + p = base_dir_ / st.path + if p.is_dir(): + assert st.type == FileType.Directory + if p.is_file(): + assert st.type == FileType.File def test_create_dir(fs, tempdir, testpath): From fadf3a43d1a124266c0984974339f86f440d670d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 12 Sep 2019 10:08:19 +0200 Subject: [PATCH 26/36] typo --- python/pyarrow/_fs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index beec81b9370..010e08834df 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -130,7 +130,7 @@ cdef class FileStats: cdef class Selector: """File and directory selector. - It containt a set of options that describes how to search for files and + It contains a set of options that describes how to search for files and directories. Parameters From 19a44ade02cae6c6a9dd393832dc5a7a24a47c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 12 Sep 2019 16:09:58 +0200 Subject: [PATCH 27/36] include symlinks and hidden files in the test glob --- python/pyarrow/tests/test_fs.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 6df35b488be..77fc4662b5e 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import os from datetime import datetime try: import pathlib @@ -116,14 +117,27 @@ def mtime_almost_equal(fs_dt, pathlib_ts): @pytest.mark.parametrize('base_dir', ['.', '..']) def test_get_target_stats_with_selector(fs, tempdir, testpath, base_dir): + def walk(directory): + # utility function to recusively list entries under a directory, + # including symlinks and hidden files + for root, dirs, files in os.walk(directory, followlinks=True): + for d in dirs: + yield os.path.join(root, d) + for f in files: + yield os.path.join(root, f) + base_dir = testpath(base_dir) base_dir_ = tempdir / base_dir selector = Selector(base_dir, allow_non_existent=False, recursive=True) assert selector.base_dir == pathlib.Path(base_dir).as_posix() + (tempdir / 'test_file').touch() + (tempdir / 'test_directory').mkdir() + stats = fs.get_target_stats(selector) - assert len(stats) == len(list(base_dir_.iterdir())) + expected = list(walk(str(base_dir_))) + assert len(stats) == len(expected) for st in stats: p = base_dir_ / st.path From 787eaf40b8042f2dd91c71348aa01fe950bd1194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 12 Sep 2019 23:43:00 +0200 Subject: [PATCH 28/36] use PurePosixPath --- cpp/src/arrow/python/datetime.cc | 3 --- python/pyarrow/_fs.pyx | 32 ++++++++++++++++---------------- python/pyarrow/tests/test_fs.py | 16 +++++++++++----- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 5a2d1e09ef2..9f5d6350eaa 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -247,9 +247,6 @@ Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** ou return Status::OK(); } -using TimePoint = - std::chrono::time_point; - Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out) { auto nanos = val.time_since_epoch(); auto micros = std::chrono::duration_cast(nanos); diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 010e08834df..77cdb6e6fa4 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -35,14 +35,14 @@ from pyarrow.lib cimport ( ) -cdef inline c_string _as_posix_path(path) except *: +cdef inline c_string _path_as_bytes(path) except *: """Converts path-like objects to bytestring with posix separators tobytes always uses utf-8, which is more or less ok, at least on Windows since the C++ side then decodes from utf-8. On Unix, os.fsencode may be better. """ - return tobytes(_stringify_path(path).replace('\\', '/')) + return tobytes(_stringify_path(path)) cpdef enum FileType: @@ -159,7 +159,7 @@ cdef class Selector: @base_dir.setter def base_dir(self, base_dir): - self.selector.base_dir = _as_posix_path(base_dir) + self.selector.base_dir = _path_as_bytes(base_dir) @property def allow_non_existent(self): @@ -223,7 +223,7 @@ cdef class FileSystem: selector = (paths_or_selector).selector check_status(self.fs.GetTargetStats(selector, &stats)) elif isinstance(paths_or_selector, (list, tuple)): - paths = [_as_posix_path(s) for s in paths_or_selector] + paths = [_path_as_bytes(s) for s in paths_or_selector] with nogil: check_status(self.fs.GetTargetStats(paths, &stats)) else: @@ -243,7 +243,7 @@ cdef class FileSystem: recursive: bool, default True Create nested directories as well. """ - cdef c_string directory = _as_posix_path(path) + cdef c_string directory = _path_as_bytes(path) with nogil: check_status(self.fs.CreateDir(directory, recursive=recursive)) @@ -255,7 +255,7 @@ cdef class FileSystem: path : str or pathlib.Path The path of the directory to be deleted. """ - cdef c_string directory = _as_posix_path(path) + cdef c_string directory = _path_as_bytes(path) with nogil: check_status(self.fs.DeleteDir(directory)) @@ -275,8 +275,8 @@ cdef class FileSystem: The destination path where the file or directory is moved to. """ cdef: - c_string source = _as_posix_path(src) - c_string destination = _as_posix_path(dest) + c_string source = _path_as_bytes(src) + c_string destination = _path_as_bytes(dest) with nogil: check_status(self.fs.Move(source, destination)) @@ -294,8 +294,8 @@ cdef class FileSystem: The destination path where the file is copied to. """ cdef: - c_string source = _as_posix_path(src) - c_string destination = _as_posix_path(dest) + c_string source = _path_as_bytes(src) + c_string destination = _path_as_bytes(dest) with nogil: check_status(self.fs.CopyFile(source, destination)) @@ -307,7 +307,7 @@ cdef class FileSystem: path : str or pathlib.Path The path of the file to be deleted. """ - cdef c_string file = _as_posix_path(path) + cdef c_string file = _path_as_bytes(path) with nogil: check_status(self.fs.DeleteFile(file)) @@ -342,7 +342,7 @@ cdef class FileSystem: stram : NativeFile """ cdef: - c_string pathstr = _as_posix_path(path) + c_string pathstr = _path_as_bytes(path) NativeFile stream = NativeFile() shared_ptr[CRandomAccessFile] in_handle @@ -375,7 +375,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _as_posix_path(path) + c_string pathstr = _path_as_bytes(path) NativeFile stream = NativeFile() shared_ptr[CInputStream] in_handle @@ -413,7 +413,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _as_posix_path(path) + c_string pathstr = _path_as_bytes(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle @@ -451,7 +451,7 @@ cdef class FileSystem: stream : NativeFile """ cdef: - c_string pathstr = _as_posix_path(path) + c_string pathstr = _path_as_bytes(path) NativeFile stream = NativeFile() shared_ptr[COutputStream] out_handle @@ -505,7 +505,7 @@ cdef class SubTreeFileSystem(FileSystem): c_string pathstr shared_ptr[CSubTreeFileSystem] wrapped - pathstr = _as_posix_path(base_path) + pathstr = _path_as_bytes(base_path) wrapped = make_shared[CSubTreeFileSystem](pathstr, base_fs.wrapped) self.init( wrapped) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 77fc4662b5e..fdd2a8bfbeb 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -45,7 +45,7 @@ def fs(request, tempdir): @pytest.fixture(params=[ - pytest.param(pathlib.Path, id='Path'), + pytest.param(pathlib.PurePosixPath, id='Path'), pytest.param(str, id='str') ]) def testpath(request, fs, tempdir): @@ -53,10 +53,16 @@ def testpath(request, fs, tempdir): # if the filesystem is wrapped in a SubTreeFileSystem then we don't need # to prepend the path with the tempdir, we also test the API with both # pathlib.Path objects and plain python strings - if isinstance(fs, SubTreeFileSystem): - return lambda path: request.param(path) - else: - return lambda path: request.param(tempdir / path) + def convert(path): + if isinstance(fs, SubTreeFileSystem): + path = pathlib.PurePosixPath(path) + else: + path = tempdir / path + # convert to abstract, slash separated paths with as_posix + path = path.as_posix() + # return with the corrent + return request.param(path) + return convert def test_cannot_instantiate_base_filesystem(): From a391016e86db154c4ab509793540147316dbdefb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Sep 2019 00:26:59 +0200 Subject: [PATCH 29/36] move out PyDateTime_IMPORT --- cpp/src/arrow/python/datetime.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 9f5d6350eaa..592fea24393 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -224,7 +224,6 @@ Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { } Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { - PyDateTime_IMPORT; int64_t year = 0, month = 0, day = 0; RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); *out = PyDate_FromDate(static_cast(year), static_cast(month), @@ -233,7 +232,6 @@ Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { } Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) { - PyDateTime_IMPORT; int64_t hour = 0, minute = 0, second = 0, microsecond = 0; RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); int64_t total_days = 0; @@ -248,6 +246,7 @@ Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** ou } Status PyDateTime_from_TimePoint(TimePoint val, PyObject** out) { + PyDateTime_IMPORT; auto nanos = val.time_since_epoch(); auto micros = std::chrono::duration_cast(nanos); RETURN_NOT_OK(PyDateTime_from_int(micros.count(), TimeUnit::MICRO, out)); From 9c90de5ed831d41390d8a146c4bdd739d9aaf966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 13 Sep 2019 12:15:45 +0200 Subject: [PATCH 30/36] use slash delimited paths everywhere in the tests --- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/includes/libarrow_fs.pxd | 3 ++- python/pyarrow/tests/test_fs.py | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e3a9d199b36..14d672edf89 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1413,7 +1413,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: cdef cppclass CTimePoint "arrow::py::internal::TimePoint": pass - cdef CStatus PyDateTime_from_int(int64_t val, const TimeUnit unit, PyObject** out) + cdef CStatus PyDateTime_from_int(int64_t val, const TimeUnit unit, + PyObject** out) cdef CStatus PyDateTime_from_TimePoint(CTimePoint val, PyObject** out) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index c29db700116..f54a2e50357 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -86,6 +86,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: cdef cppclass CLocalFileSystem "arrow::fs::LocalFileSystem"(CFileSystem): LocalFileSystem() - cdef cppclass CSubTreeFileSystem "arrow::fs::SubTreeFileSystem"(CFileSystem): + cdef cppclass CSubTreeFileSystem \ + "arrow::fs::SubTreeFileSystem"(CFileSystem): CSubTreeFileSystem(const c_string& base_path, shared_ptr[CFileSystem] base_fs) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index fdd2a8bfbeb..b3446b46d77 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -41,7 +41,7 @@ ) ]) def fs(request, tempdir): - return request.param(tempdir) + return request.param(tempdir.as_posix()) @pytest.fixture(params=[ @@ -98,7 +98,7 @@ def mtime_almost_equal(fs_dt, pathlib_ts): aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) - assert aaa_stat.path == pathlib.Path(aaa).as_posix() + assert aaa_stat.path == str(aaa) assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' assert aaa_stat.type == FileType.Directory @@ -106,14 +106,14 @@ def mtime_almost_equal(fs_dt, pathlib_ts): with pytest.raises(ValueError): aaa_stat.size - assert bb_stat.path == pathlib.Path(bb).as_posix() + assert bb_stat.path == str(bb) assert bb_stat.base_name == 'bb' assert bb_stat.extension == '' assert bb_stat.type == FileType.File assert bb_stat.size == 0 assert mtime_almost_equal(bb_stat.mtime, bb_.stat().st_mtime) - assert c_stat.path == pathlib.Path(c).as_posix() + assert c_stat.path == str(c) assert c_stat.base_name == 'c.txt' assert c_stat.extension == 'txt' assert c_stat.type == FileType.File From ab803e393c9f6873031d0400d1bfe9c039a3b612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 16 Sep 2019 14:22:46 +0200 Subject: [PATCH 31/36] windows --- python/pyarrow/tests/test_fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index b3446b46d77..f99cca4af41 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -136,7 +136,7 @@ def walk(directory): base_dir_ = tempdir / base_dir selector = Selector(base_dir, allow_non_existent=False, recursive=True) - assert selector.base_dir == pathlib.Path(base_dir).as_posix() + assert selector.base_dir == str(base_dir) (tempdir / 'test_file').touch() (tempdir / 'test_directory').mkdir() From 58ce6d10867841d0b221eea6713627fa622b7997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 16 Sep 2019 16:50:02 +0200 Subject: [PATCH 32/36] repr --- python/pyarrow/_fs.pyx | 10 ++++++++++ python/pyarrow/tests/test_fs.py | 14 +++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 77cdb6e6fa4..b4504ad4c7d 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -66,6 +66,16 @@ cdef class FileStats: self.stats = stats return self + def __repr__(self): + def getvalue(attr): + try: + return getattr(self, attr) + except ValueError: + return '' + attrs = ['type', 'path', 'base_name', 'size', 'extension', 'mtime'] + attrs = '\n'.join('{}: {}'.format(a, getvalue(a)) for a in attrs) + return '{}\n{}'.format(object.__repr__(self), attrs) + @property def type(self): """Type of the file diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index f99cca4af41..ca1a6078d17 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -99,6 +99,7 @@ def mtime_almost_equal(fs_dt, pathlib_ts): aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) assert aaa_stat.path == str(aaa) + assert 'aaa' in repr(aaa_stat) assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' assert aaa_stat.type == FileType.Directory @@ -121,17 +122,8 @@ def mtime_almost_equal(fs_dt, pathlib_ts): assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) -@pytest.mark.parametrize('base_dir', ['.', '..']) +@pytest.mark.parametrize('base_dir', ['.']) def test_get_target_stats_with_selector(fs, tempdir, testpath, base_dir): - def walk(directory): - # utility function to recusively list entries under a directory, - # including symlinks and hidden files - for root, dirs, files in os.walk(directory, followlinks=True): - for d in dirs: - yield os.path.join(root, d) - for f in files: - yield os.path.join(root, f) - base_dir = testpath(base_dir) base_dir_ = tempdir / base_dir @@ -142,7 +134,7 @@ def walk(directory): (tempdir / 'test_directory').mkdir() stats = fs.get_target_stats(selector) - expected = list(walk(str(base_dir_))) + expected = list(base_dir_.iterdir()) assert len(stats) == len(expected) for st in stats: From cf066da8c8d161a2f828f7ea61852ec9f48d0671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 16 Sep 2019 20:20:11 +0200 Subject: [PATCH 33/36] try to fix appveyor test --- python/pyarrow/tests/test_fs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ca1a6078d17..d146b140cfc 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -122,10 +122,9 @@ def mtime_almost_equal(fs_dt, pathlib_ts): assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) -@pytest.mark.parametrize('base_dir', ['.']) -def test_get_target_stats_with_selector(fs, tempdir, testpath, base_dir): - base_dir = testpath(base_dir) - base_dir_ = tempdir / base_dir +def test_get_target_stats_with_selector(fs, tempdir, testpath): + base_dir = testpath('.') + base_dir_ = tempdir selector = Selector(base_dir, allow_non_existent=False, recursive=True) assert selector.base_dir == str(base_dir) From ea3a488fe382557eba25ef5c63fc2ec0dd809182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Sep 2019 12:55:32 +0200 Subject: [PATCH 34/36] flake8 --- python/pyarrow/tests/test_fs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index d146b140cfc..fab935a97b9 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import os from datetime import datetime try: import pathlib From a7798f5b507f1b1898c170401334162bfd3a0ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Sep 2019 13:41:06 +0200 Subject: [PATCH 35/36] don't accept pathlib.Paths objects --- python/pyarrow/_fs.pyx | 15 ++++++++------- python/pyarrow/tests/test_fs.py | 18 +++++++----------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index b4504ad4c7d..769ca8d7391 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -36,13 +36,14 @@ from pyarrow.lib cimport ( cdef inline c_string _path_as_bytes(path) except *: - """Converts path-like objects to bytestring with posix separators - - tobytes always uses utf-8, which is more or less ok, at least on Windows - since the C++ side then decodes from utf-8. On Unix, os.fsencode may be - better. - """ - return tobytes(_stringify_path(path)) + # handle only abstract paths, not bound to any filesystem like pathlib is, + # so we only accept plain strings + if not isinstance(path, six.string_types): + raise TypeError('Path must be a string') + # tobytes always uses utf-8, which is more or less ok, at least on Windows + # since the C++ side then decodes from utf-8. On Unix, os.fsencode may be + # better. + return tobytes(path) cpdef enum FileType: diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index fab935a97b9..60203916149 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -24,6 +24,7 @@ import pytest from pyarrow import ArrowIOError +from pyarrow.compat import tobytes, unicode_type from pyarrow.fs import (FileType, Selector, FileSystem, LocalFileSystem, SubTreeFileSystem) from pyarrow.tests.test_io import gzip_compress, gzip_decompress @@ -43,10 +44,7 @@ def fs(request, tempdir): return request.param(tempdir.as_posix()) -@pytest.fixture(params=[ - pytest.param(pathlib.PurePosixPath, id='Path'), - pytest.param(str, id='str') -]) +@pytest.fixture def testpath(request, fs, tempdir): # we always use the tempdir for reading and writing test artifacts, but # if the filesystem is wrapped in a SubTreeFileSystem then we don't need @@ -54,13 +52,10 @@ def testpath(request, fs, tempdir): # pathlib.Path objects and plain python strings def convert(path): if isinstance(fs, SubTreeFileSystem): - path = pathlib.PurePosixPath(path) + path = pathlib.Path(path) else: path = tempdir / path - # convert to abstract, slash separated paths with as_posix - path = path.as_posix() - # return with the corrent - return request.param(path) + return path.as_posix() return convert @@ -73,7 +68,8 @@ def test_non_path_like_input_raises(fs): class Path: pass - invalid_paths = [1, 1.1, Path(), tuple(), {}, [], lambda: 1] + invalid_paths = [1, 1.1, Path(), tuple(), {}, [], lambda: 1, + pathlib.Path()] for path in invalid_paths: with pytest.raises(TypeError): fs.create_dir(path) @@ -97,7 +93,7 @@ def mtime_almost_equal(fs_dt, pathlib_ts): aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) - assert aaa_stat.path == str(aaa) + assert aaa_stat.path == aaa assert 'aaa' in repr(aaa_stat) assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' From 9b66277d404fd62fb151982353e9ba48c7dcc8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 17 Sep 2019 13:41:39 +0200 Subject: [PATCH 36/36] flake8 --- python/pyarrow/tests/test_fs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 60203916149..f897e0d36f9 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -24,7 +24,6 @@ import pytest from pyarrow import ArrowIOError -from pyarrow.compat import tobytes, unicode_type from pyarrow.fs import (FileType, Selector, FileSystem, LocalFileSystem, SubTreeFileSystem) from pyarrow.tests.test_io import gzip_compress, gzip_decompress