diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 44afd4ba0bf..3f2d5506317 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -463,6 +463,11 @@ cdef class FunctionRegistry(_Weakrefable): def get_function(self, name): """ Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup """ cdef: c_string c_name = tobytes(name) @@ -485,6 +490,11 @@ def get_function(name): The function is looked up in the global registry (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup """ return _global_func_registry.get_function(name) @@ -502,6 +512,17 @@ def call_function(name, args, options=None, memory_pool=None): The function is looked up in the global registry (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. """ func = _global_func_registry.get_function(name) return func.call(args, options=options, memory_pool=memory_pool) @@ -524,6 +545,14 @@ cdef class FunctionOptions(_Weakrefable): @staticmethod def deserialize(buf): + """ + Deserialize options for a function. + + Parameters + ---------- + buf : Buffer + The buffer containing the data to deserialize. + """ cdef: shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf) CResult[unique_ptr[CFunctionOptions]] maybe_options = \ @@ -670,6 +699,14 @@ class CastOptions(_CastOptions): @staticmethod def safe(target_type=None): + """" + Create a CastOptions for a safe cast. + + Parameters + ---------- + target_type : optional + Target cast type for the safe cast. + """ self = CastOptions() self._set_safe() self._set_type(target_type) @@ -677,6 +714,14 @@ class CastOptions(_CastOptions): @staticmethod def unsafe(target_type=None): + """" + Create a CastOptions for an unsafe cast. + + Parameters + ---------- + target_type : optional + Target cast type for the unsafe cast. + """ self = CastOptions() self._set_unsafe() self._set_type(target_type) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 950e2d5464c..19ade43249c 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -59,10 +59,10 @@ cdef class ReadOptions(_Weakrefable): This will determine multi-threading granularity as well as the size of individual record batches or table chunks. Minimum valid value for block size is 1 - skip_rows: int, optional (default 0) + skip_rows : int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. - skip_rows_after_names: int, optional (default 0) + skip_rows_after_names : int, optional (default 0) The number of rows to skip after the column names. This number can be larger than the number of rows in one block, and empty rows are counted. @@ -70,15 +70,15 @@ cdef class ReadOptions(_Weakrefable): - `skip_rows` is applied (if non-zero); - column names aread (unless `column_names` is set); - `skip_rows_after_names` is applied (if non-zero). - column_names: list, optional + column_names : list, optional The column names of the target table. If empty, fall back on `autogenerate_column_names`. 
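Editor's note: the hunks above document the function registry, `call_function`, and `CastOptions`. A minimal usage sketch (not part of the patch; the arrays and target type are illustrative):

    import pyarrow as pa
    import pyarrow.compute as pc

    # Look up a kernel in the global registry and invoke it directly.
    add = pc.get_function("add")
    print(add.call([pa.array([1, 2, 3]), pa.array([10, 20, 30])]))

    # One-shot call through the global registry.
    print(pc.call_function("sum", [pa.array([1, 2, 3])]))

    # CastOptions.safe/unsafe carry the target type for the "cast" meta-function.
    opts = pc.CastOptions.safe(pa.int32())
    print(pc.call_function("cast", [pa.array([1.0, 2.0])], options=opts))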
- autogenerate_column_names: bool, optional (default False) + autogenerate_column_names : bool, optional (default False) Whether to autogenerate column names if `column_names` is empty. If true, column names will be of the form "f0", "f1"... If false, column names will be read from the first CSV row after `skip_rows`. - encoding: str, optional (default 'utf8') + encoding : str, optional (default 'utf8') The character encoding of the CSV data. Columns that cannot decode using this encoding can still be read as Binary. """ @@ -235,22 +235,22 @@ cdef class ParseOptions(_Weakrefable): Parameters ---------- - delimiter: 1-character string, optional (default ',') + delimiter : 1-character string, optional (default ',') The character delimiting individual cells in the CSV data. - quote_char: 1-character string or False, optional (default '"') + quote_char : 1-character string or False, optional (default '"') The character used optionally for quoting CSV values (False if quoting is not allowed). - double_quote: bool, optional (default True) + double_quote : bool, optional (default True) Whether two quotes in a quoted CSV value denote a single quote in the data. - escape_char: 1-character string or False, optional (default False) + escape_char : 1-character string or False, optional (default False) The character used optionally for escaping special characters (False if escaping is not allowed). - newlines_in_values: bool, optional (default False) + newlines_in_values : bool, optional (default False) Whether newline characters are allowed in CSV values. Setting this to True reduces the performance of multi-threaded CSV reading. - ignore_empty_lines: bool, optional (default True) + ignore_empty_lines : bool, optional (default True) Whether empty lines are ignored in CSV input. If False, an empty line is interpreted as containing a single empty value (assuming a one-column CSV file). @@ -423,53 +423,53 @@ cdef class ConvertOptions(_Weakrefable): ---------- check_utf8 : bool, optional (default True) Whether to check UTF8 validity of string columns. - column_types: pa.Schema or dict, optional + column_types : pa.Schema or dict, optional Explicitly map column names to column types. Passing this argument disables type inference on the defined columns. - null_values: list, optional + null_values : list, optional A sequence of strings that denote nulls in the data (defaults are appropriate in most cases). Note that by default, string columns are not checked for null values. To enable null checking for those, specify ``strings_can_be_null=True``. - true_values: list, optional + true_values : list, optional A sequence of strings that denote true booleans in the data (defaults are appropriate in most cases). - false_values: list, optional + false_values : list, optional A sequence of strings that denote false booleans in the data (defaults are appropriate in most cases). - decimal_point: 1-character string, optional (default '.') + decimal_point : 1-character string, optional (default '.') The character used as decimal point in floating-point and decimal data. - timestamp_parsers: list, optional + timestamp_parsers : list, optional A sequence of strptime()-compatible format strings, tried in order when attempting to infer or convert timestamp values (the special value ISO8601() can also be given). By default, a fast built-in ISO-8601 parser is used. - strings_can_be_null: bool, optional (default False) + strings_can_be_null : bool, optional (default False) Whether string / binary columns can have null values. 
If true, then strings in null_values are considered null for string columns. If false, then all strings are valid string values. - quoted_strings_can_be_null: bool, optional (default True) + quoted_strings_can_be_null : bool, optional (default True) Whether quoted values can be null. If true, then strings in "null_values" are also considered null when they appear quoted in the CSV file. Otherwise, quoted values are never considered null. - auto_dict_encode: bool, optional (default False) + auto_dict_encode : bool, optional (default False) Whether to try to automatically dict-encode string / binary data. If true, then when type inference detects a string or binary column, it it dict-encoded up to `auto_dict_max_cardinality` distinct values (per chunk), after which it switches to regular encoding. This setting is ignored for non-inferred columns (those in `column_types`). - auto_dict_max_cardinality: int, optional + auto_dict_max_cardinality : int, optional The maximum dictionary cardinality for `auto_dict_encode`. This value is per chunk. - include_columns: list, optional + include_columns : list, optional The names of columns to include in the Table. If empty, the Table will include all columns from the CSV file. If not empty, only these columns will be included, in this order. - include_missing_columns: bool, optional (default False) + include_missing_columns : bool, optional (default False) If false, columns in `include_columns` but not in the CSV file will error out. If true, columns in `include_columns` but not in the CSV file will @@ -848,20 +848,20 @@ def read_csv(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : string, path or file-like object The location of CSV data. If a string or path, and if it ends with a recognized compressed file extension (e.g. ".gz" or ".bz2"), the data is automatically decompressed when reading. - read_options: pyarrow.csv.ReadOptions, optional + read_options : pyarrow.csv.ReadOptions, optional Options for the CSV reader (see pyarrow.csv.ReadOptions constructor for defaults) - parse_options: pyarrow.csv.ParseOptions, optional + parse_options : pyarrow.csv.ParseOptions, optional Options for the CSV parser (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options: pyarrow.csv.ConvertOptions, optional + convert_options : pyarrow.csv.ConvertOptions, optional Options for converting CSV data (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool to allocate Table memory from Returns @@ -906,20 +906,20 @@ def open_csv(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : string, path or file-like object The location of CSV data. If a string or path, and if it ends with a recognized compressed file extension (e.g. ".gz" or ".bz2"), the data is automatically decompressed when reading. 
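Editor's note: a short sketch of reading CSV data with the option classes documented above (not part of the patch; the sample bytes are made up):

    import io

    import pyarrow as pa
    from pyarrow import csv

    data = io.BytesIO(b"col1,col2\n1,foo\n2,bar\n")
    table = csv.read_csv(
        data,
        read_options=csv.ReadOptions(skip_rows=0, encoding="utf8"),
        parse_options=csv.ParseOptions(delimiter=","),
        convert_options=csv.ConvertOptions(column_types={"col1": pa.int64()}),
    )
    print(table)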
- read_options: pyarrow.csv.ReadOptions, optional + read_options : pyarrow.csv.ReadOptions, optional Options for the CSV reader (see pyarrow.csv.ReadOptions constructor for defaults) - parse_options: pyarrow.csv.ParseOptions, optional + parse_options : pyarrow.csv.ParseOptions, optional Options for the CSV parser (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options: pyarrow.csv.ConvertOptions, optional + convert_options : pyarrow.csv.ConvertOptions, optional Options for converting CSV data (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool to allocate Table memory from Returns @@ -1014,13 +1014,13 @@ def write_csv(data, output_file, write_options=None, Parameters ---------- - data: pyarrow.RecordBatch or pyarrow.Table + data : pyarrow.RecordBatch or pyarrow.Table The data to write. - output_file: string, path, pyarrow.NativeFile, or file-like object + output_file : string, path, pyarrow.NativeFile, or file-like object The location where to write the CSV data. - write_options: pyarrow.csv.WriteOptions + write_options : pyarrow.csv.WriteOptions Options to configure writing the CSV data. - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool for temporary allocations. """ cdef: @@ -1047,17 +1047,18 @@ def write_csv(data, output_file, write_options=None, cdef class CSVWriter(_CRecordBatchWriter): - """Writer to create a CSV file. + """ + Writer to create a CSV file. Parameters ---------- - sink: string, path, pyarrow.OutputStream or file-like object + sink : str, path, pyarrow.OutputStream or file-like object The location where to write the CSV data. - schema: pyarrow.Schema + schema : pyarrow.Schema The schema of the data to be written. - write_options: pyarrow.csv.WriteOptions + write_options : pyarrow.csv.WriteOptions Options to configure writing the CSV data. - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool for temporary allocations. """ diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index f4ca7639885..1b66b95089a 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -187,7 +187,8 @@ cdef class Context(_Weakrefable): return pyarrow_wrap_cudabuffer(cudabuf) def foreign_buffer(self, address, size, base=None): - """Create device buffer from address and size as a view. + """ + Create device buffer from address and size as a view. The caller is responsible for allocating and freeing the memory. When `address==size==0` then a new zero-sized buffer diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 5a906a32ade..586b03aded8 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -508,12 +508,13 @@ cdef class Dataset(_Weakrefable): cdef class InMemoryDataset(Dataset): - """A Dataset wrapping in-memory data. + """ + A Dataset wrapping in-memory data. Parameters ---------- - source - The data for this dataset. Can be a RecordBatch, Table, list of + source : The data for this dataset. + Can be a RecordBatch, Table, list of RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. If an iterable is provided, the schema must also be provided. schema : Schema, optional @@ -569,7 +570,8 @@ cdef class InMemoryDataset(Dataset): cdef class UnionDataset(Dataset): - """A Dataset wrapping child datasets. + """ + A Dataset wrapping child datasets. Children's schemas must agree with the provided schema. 
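Editor's note: `InMemoryDataset` wraps data that already lives in memory; a minimal sketch (not part of the patch; the table contents are illustrative):

    import pyarrow as pa
    import pyarrow.dataset as ds

    table = pa.table({"year": [2019, 2020, 2021], "n": [4, 8, 15]})
    dataset = ds.InMemoryDataset(table)   # a RecordBatch, reader or list also works
    print(dataset.to_table().num_rows)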
@@ -611,7 +613,8 @@ cdef class UnionDataset(Dataset): cdef class FileSystemDataset(Dataset): - """A Dataset of file fragments. + """ + A Dataset of file fragments. A FileSystemDataset is composed of one or more FileFragment. @@ -1154,7 +1157,15 @@ cdef class FileFragment(Fragment): class RowGroupInfo: - """A wrapper class for RowGroup information""" + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : the group id. + metadata : the rowgroup metadata. + schema : schema of the rows. + """ def __init__(self, id, metadata, schema): self.id = id @@ -1534,6 +1545,18 @@ cdef set _PARQUET_READ_OPTIONS = { cdef class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option. + """ cdef: CParquetFileFormat* parquet_format @@ -1668,7 +1691,8 @@ cdef class ParquetFileFormat(FileFormat): cdef class ParquetFragmentScanOptions(FragmentScanOptions): - """Scan-specific options for Parquet fragments. + """ + Scan-specific options for Parquet fragments. Parameters ---------- @@ -1795,6 +1819,20 @@ cdef class IpcFileFormat(FileFormat): cdef class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : ParseOptions + Options regarding CSV parsing. + convert_options : ConvertOptions + Options regarding value conversion. + read_options : ReadOptions + General read options. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + """ cdef: CCsvFileFormat* csv_format @@ -1864,7 +1902,16 @@ cdef class CsvFileFormat(FileFormat): cdef class CsvFragmentScanOptions(FragmentScanOptions): - """Scan-specific options for CSV fragments.""" + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : ConvertOptions + Options regarding value conversion. + read_options : ReadOptions + General read options. + """ cdef: CCsvFragmentScanOptions* csv_options @@ -2329,7 +2376,7 @@ cdef class DatasetFactory(_Weakrefable): shared_ptr[CDatasetFactory] wrapped CDatasetFactory* factory - def __init__(self, list children): + def __init__(self): _forbid_instantiation(self.__class__) cdef init(self, const shared_ptr[CDatasetFactory]& sp): @@ -2387,7 +2434,7 @@ cdef class DatasetFactory(_Weakrefable): Parameters ---------- - schema: Schema, default None + schema : Schema, default None The schema to conform the source to. If None, the inspected schema is used. @@ -2422,7 +2469,7 @@ cdef class FileSystemFactoryOptions(_Weakrefable): partition_base_dir prefix will be skipped for partitioning discovery. The ignored files will still be part of the Dataset, but will not have partition information. - partitioning: Partitioning/PartitioningFactory, optional + partitioning : Partitioning/PartitioningFactory, optional Apply the Partitioning to every discovered Fragment. See Partitioning or PartitioningFactory documentation. exclude_invalid_files : bool, optional (default True) @@ -2532,7 +2579,7 @@ cdef class FileSystemDatasetFactory(DatasetFactory): ---------- filesystem : pyarrow.fs.FileSystem Filesystem to discover. - paths_or_selector: pyarrow.fs.Selector or list of path-likes + paths_or_selector : pyarrow.fs.Selector or list of path-likes Either a Selector object or a list of path-like objects. 
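Editor's note: a sketch of discovering a dataset with `FileSystemDatasetFactory` (not part of the patch; `my_dataset/` is a hypothetical directory of Parquet files that must already exist):

    import pyarrow.dataset as ds
    from pyarrow import fs

    selector = fs.FileSelector("my_dataset", recursive=True)
    factory = ds.FileSystemDatasetFactory(
        fs.LocalFileSystem(), selector, ds.ParquetFileFormat(),
        ds.FileSystemFactoryOptions(partition_base_dir="my_dataset"))
    dataset = factory.finish()            # or factory.inspect() to peek at the schema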
format : FileFormat Currently only ParquetFileFormat and IpcFileFormat are supported. @@ -2791,7 +2838,14 @@ cdef class RecordBatchIterator(_Weakrefable): class TaggedRecordBatch(collections.namedtuple( "TaggedRecordBatch", ["record_batch", "fragment"])): - """A combination of a record batch and the fragment it came from.""" + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : The record batch. + fragment : fragment of the record batch. + """ cdef class TaggedRecordBatchIterator(_Weakrefable): @@ -2945,6 +2999,33 @@ cdef class Scanner(_Weakrefable): object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): + """ + Create Scanner from Dataset, + refer to Scanner class doc for additional details on Scanner. + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. + """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() shared_ptr[CScannerBuilder] builder @@ -2966,6 +3047,35 @@ cdef class Scanner(_Weakrefable): object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): + """ + Create Scanner from Fragment, + refer to Scanner class doc for additional details on Scanner. + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema + The schema of the fragment. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. + """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() shared_ptr[CScannerBuilder] builder @@ -2990,12 +3100,38 @@ cdef class Scanner(_Weakrefable): Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): - """Create a Scanner from an iterator of batches. + """ + Create a Scanner from an iterator of batches. This creates a scanner which can be used only once. It is intended to support writing a dataset (which takes a scanner) from a source which can be read only once (e.g. a RecordBatchReader or generator). 
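Editor's note: a sketch of building a scanner with the `from_dataset` signature documented above (not part of the patch; the table contents are illustrative):

    import pyarrow as pa
    import pyarrow.dataset as ds

    dataset = ds.InMemoryDataset(pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]}))
    scanner = ds.Scanner.from_dataset(dataset, columns=["b"],
                                      filter=ds.field("a") > 1)
    print(scanner.to_table())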
+ + Parameters + ---------- + source : Iterator + The iterator of Batches. + schema : Schema + The schema of the batches. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index bb639c09719..a309a42ec86 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -597,15 +597,15 @@ cdef class FileSystem(_Weakrefable): Parameters ---------- - source: str + source : str The source to open for reading. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary read buffer. @@ -639,16 +639,16 @@ cdef class FileSystem(_Weakrefable): ---------- path : str The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. - metadata: dict optional, default None + metadata : dict optional, default None If not None, a mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). @@ -693,16 +693,16 @@ cdef class FileSystem(_Weakrefable): ---------- path : str The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. - metadata: dict optional, default None + metadata : dict optional, default None If not None, a mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). 
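Editor's note: a sketch of the stream-opening methods documented above (not part of the patch; `data.txt.gz` is a hypothetical path, and its ".gz" suffix is what triggers compression='detect'):

    from pyarrow import fs

    local = fs.LocalFileSystem()
    with local.open_output_stream("data.txt.gz") as out:   # gzip chosen from suffix
        out.write(b"hello world\n")
    with local.open_input_stream("data.txt.gz") as src:
        print(src.read())                                  # transparently decompressed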
@@ -768,7 +768,7 @@ cdef class LocalFileSystem(FileSystem): Parameters ---------- - use_mmap: bool, default False + use_mmap : bool, default False Whether open_input_stream and open_input_file should return a mmap'ed file or a regular file. """ @@ -813,9 +813,9 @@ cdef class SubTreeFileSystem(FileSystem): Parameters ---------- - base_path: str + base_path : str The root of the subtree. - base_fs: FileSystem + base_fs : FileSystem FileSystem object the operations delegated to. """ @@ -939,30 +939,51 @@ class FileSystemHandler(ABC): def get_file_info(self, paths): """ Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : paths for which we want to retrieve the info. """ @abstractmethod def get_file_info_selector(self, selector): """ Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : selector for which we want to retrieve the info. """ @abstractmethod def create_dir(self, path, recursive): """ Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : path of the directory. + recursive : if the parent directories should be created too. """ @abstractmethod def delete_dir(self, path): """ Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : path of the directory. """ @abstractmethod def delete_dir_contents(self, path): """ Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : path of the directory. """ @abstractmethod @@ -975,53 +996,93 @@ class FileSystemHandler(ABC): def delete_file(self, path): """ Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : path of the file. """ @abstractmethod def move(self, src, dest): """ Implement PyFileSystem.move(...). + + Parameters + ---------- + src : path of what should be moved. + dest : path of where it should be moved to. """ @abstractmethod def copy_file(self, src, dest): """ Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : path of what should be copied. + dest : path of where it should be copied to. """ @abstractmethod def open_input_stream(self, path): """ Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : path of what should be opened. """ @abstractmethod def open_input_file(self, path): """ Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : path of what should be opened. """ @abstractmethod def open_output_stream(self, path, metadata): """ Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : path of what should be opened. + metadata : mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). """ @abstractmethod def open_append_stream(self, path, metadata): """ Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : path of what should be opened. + metadata : mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). """ @abstractmethod def normalize_path(self, path): """ Implement PyFileSystem.normalize_path(...). - """ + Parameters + ---------- + path : path of what should be normalized. 
+ """ # Callback definitions for CPyFileSystemVtable + cdef void _cb_get_type_name(handler, c_string* out) except *: out[0] = tobytes("py::" + handler.get_type_name()) diff --git a/python/pyarrow/_json.pyx b/python/pyarrow/_json.pyx index 183bd4fdd4d..1c08e546ec9 100644 --- a/python/pyarrow/_json.pyx +++ b/python/pyarrow/_json.pyx @@ -86,12 +86,12 @@ cdef class ParseOptions(_Weakrefable): Parameters ---------- - explicit_schema: Schema, optional (default None) + explicit_schema : Schema, optional (default None) Optional explicit schema (no type inference, ignores other fields). - newlines_in_values: bool, optional (default False) + newlines_in_values : bool, optional (default False) Whether objects may be printed across multiple lines (for example pretty printed). If false, input must end with an empty line. - unexpected_field_behavior: str, default "infer" + unexpected_field_behavior : str, default "infer" How JSON fields outside of explicit_schema (if given) are treated. Possible behaviors: @@ -211,16 +211,16 @@ def read_json(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : str, path or file-like object The location of JSON data. Currently only the line-delimited JSON format is supported. - read_options: pyarrow.json.ReadOptions, optional - Options for the JSON reader (see ReadOptions constructor for defaults) - parse_options: pyarrow.json.ParseOptions, optional + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional Options for the JSON parser - (see ParseOptions constructor for defaults) - memory_pool: MemoryPool, optional - Pool to allocate Table memory from + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. Returns ------- diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index a45be28d726..5829d74d31f 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -37,6 +37,14 @@ cpdef enum S3LogLevel: def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal): + """ + Initialize S3 support + + Parameters + ---------- + log_level : S3LogLevel + level of logging + """ cdef CS3GlobalOptions options options.log_level = log_level check_status(CInitializeS3(options)) @@ -47,7 +55,8 @@ def finalize_s3(): cdef class S3FileSystem(FileSystem): - """S3-backed FileSystem implementation + """ + S3-backed FileSystem implementation If neither access_key nor secret_key are provided, and role_arn is also not provided, then attempts to initialize from AWS environment variables, @@ -62,43 +71,43 @@ cdef class S3FileSystem(FileSystem): Parameters ---------- - access_key: str, default None + access_key : str, default None AWS Access Key ID. Pass None to use the standard AWS environment variables and/or configuration file. - secret_key: str, default None + secret_key : str, default None AWS Secret Access key. Pass None to use the standard AWS environment variables and/or configuration file. - session_token: str, default None + session_token : str, default None AWS Session Token. An optional session token, required if access_key and secret_key are temporary credentials from STS. - anonymous: boolean, default False + anonymous : boolean, default False Whether to connect anonymously if access_key and secret_key are None. If true, will not attempt to look up credentials using standard AWS configuration methods. 
- role_arn: str, default None + role_arn : str, default None AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. - session_name: str, default None + session_name : str, default None An optional identifier for the assumed role session. - external_id: str, default None + external_id : str, default None An optional unique identifier that might be required when you assume a role in another account. - load_frequency: int, default 900 + load_frequency : int, default 900 The frequency (in seconds) with which temporary credentials from an assumed role session will be refreshed. - region: str, default 'us-east-1' + region : str, default 'us-east-1' AWS region to connect to. - scheme: str, default 'https' + scheme : str, default 'https' S3 connection transport scheme. - endpoint_override: str, default None + endpoint_override : str, default None Override region with a connect string such as "localhost:9000" - background_writes: boolean, default True + background_writes : boolean, default True Whether file writes will be issued in the background, without blocking. - default_metadata: mapping or KeyValueMetadata, default None + default_metadata : mapping or KeyValueMetadata, default None Default metadata for open_output_stream. This will be ignored if non-empty metadata is passed to open_output_stream. - proxy_options: dict or str, default None + proxy_options : dict or str, default None If a proxy is used, provide the options here. Supported options are: 'scheme' (str: 'http' or 'https'; required), 'host' (str; required), 'port' (int; required), 'username' (str; optional), diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6d5bee6584c..c9a4f3efb5e 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -388,7 +388,7 @@ def repeat(value, size, MemoryPool memory_pool=None): Parameters ---------- - value: Scalar-like object + value : Scalar-like object Either a pyarrow.Scalar or any python object coercible to a Scalar. size : int Number of times to repeat the scalar in the output Array. @@ -878,7 +878,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - sequence : ndarray, pandas.Series, array-like + obj : ndarray, pandas.Series, array-like mask : array (boolean), optional Indicate which values are null (True) or not null (False). type : pyarrow.DataType @@ -1618,6 +1618,7 @@ cdef class ListArray(BaseListArray): ---------- offsets : Array (int32 type) values : Array (any type) + pool : MemoryPool Returns ------- @@ -1699,6 +1700,7 @@ cdef class LargeListArray(BaseListArray): ---------- offsets : Array (int64 type) values : Array (any type) + pool : MemoryPool Returns ------- @@ -1748,6 +1750,7 @@ cdef class MapArray(Array): offsets : array-like or sequence (int32 type) keys : array-like or sequence (any type) items : array-like or sequence (any type) + pool : MemoryPool Returns ------- @@ -2322,9 +2325,9 @@ cdef class ExtensionArray(Array): Parameters ---------- - typ: DataType + typ : DataType The extension type for the result array. - storage: Array + storage : Array The underlying storage for the result array. 
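Editor's note: a sketch of the `from_arrays` constructors whose `pool` parameter is documented above (not part of the patch; the values are illustrative):

    import pyarrow as pa

    values = pa.array([7, 8, 9, 10])
    offsets = pa.array([0, 2, 4], type=pa.int32())
    lists = pa.ListArray.from_arrays(offsets, values)        # two lists of two items

    keys = pa.array(["k1", "k2", "k1"])
    items = pa.array([1, 2, 3])
    maps = pa.MapArray.from_arrays(pa.array([0, 2, 3], type=pa.int32()), keys, items)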
Returns diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 8640ea683da..80c6614fc9c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -132,11 +132,15 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): if option_class is not None: doc_pieces.append("""\ options : pyarrow.compute.{0}, optional - Parameters altering compute function semantics - **kwargs : optional - Parameters for {0} constructor. Either `options` - or `**kwargs` can be passed, but not both at the same time. + Parameters altering compute function semantics. """.format(option_class.__name__)) + options_sig = inspect.signature(option_class) + for p in options_sig.parameters.values(): + doc_pieces.append("""\ + {0} : optional + Parameter for {1} constructor. Either `options` + or `{0}` can be passed, but not both at the same time. + """.format(p.name, option_class.__name__)) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper @@ -630,10 +634,10 @@ def fill_null(values, fill_value): Parameters ---------- - data : Array, ChunkedArray, or Scalar-like object + values : Array, ChunkedArray, or Scalar-like object Each null element is replaced with the corresponding value from fill_value. - fill_value: Array, ChunkedArray, or Scalar-like object + fill_value : Array, ChunkedArray, or Scalar-like object If not same type as data will attempt to cast. Returns diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 882427f32ea..0280016288a 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -163,7 +163,7 @@ def enable_signal_handlers(c_bool enable): Parameters ---------- - enable: bool + enable : bool Whether to enable user interruption by setting a temporary signal handler. """ diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 1e6875ac08e..225992dc514 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -205,7 +205,7 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True): columns : sequence, optional Only read a specific set of columns. If not provided, all columns are read. - use_threads: bool, default True + use_threads : bool, default True Whether to parallelize reading using multiple threads. memory_map : boolean, default True Use memory mapping when opening file on disk diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 80ee536d44c..5d33268618f 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -256,6 +256,12 @@ class FSSpecHandler(FileSystemHandler): https://filesystem-spec.readthedocs.io/en/latest/index.html + Parameters + ---------- + fs : The FSSpec-compliant filesystem instance. + + Examples + -------- >>> PyFileSystem(FSSpecHandler(fsspec_fs)) """ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7d7cb1afb00..7358cb7026f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -759,6 +759,16 @@ cdef class MemoryMappedFile(NativeFile): @staticmethod def create(path, size): + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ cdef: shared_ptr[CMemoryMappedFile] handle c_string c_path = encode_file_path(path) @@ -1307,6 +1317,20 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): + """ + An input stream that performs buffered reads from + an unbuffered input stream, which can mitigate the overhead + of many small reads in some cases. 
+ + Parameters + ---------- + stream : NativeFile + The input stream to wrap with the buffer. + buffer_size : int + Size of the temporary read buffer. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__(self, NativeFile stream, int buffer_size, MemoryPool memory_pool=None): @@ -1357,6 +1381,20 @@ cdef class BufferedInputStream(NativeFile): cdef class BufferedOutputStream(NativeFile): + """ + An output stream that performs buffered writes to + an unbuffered output stream, which can mitigate the overhead + of many small writes in some cases. + + Parameters + ---------- + stream : NativeFile + The writable output stream to wrap with the buffer. + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__(self, NativeFile stream, int buffer_size, MemoryPool memory_pool=None): @@ -1406,6 +1444,16 @@ cdef void _cb_transform(transform_func, const shared_ptr[CBuffer]& src, cdef class TransformInputStream(NativeFile): + """ + Transform an input stream. + + Parameters + ---------- + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ def __init__(self, NativeFile stream, transform_func): self.set_input_stream(TransformInputStream.make_native( @@ -1436,6 +1484,20 @@ class Transcoder: def transcoding_input_stream(stream, src_encoding, dest_encoding): + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + then re-encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data. + dest_encoding : str + The codec to use for emitted data. + """ src_codec = codecs.lookup(src_encoding) dest_codec = codecs.lookup(dest_encoding) if src_codec.name == dest_codec.name: @@ -1464,6 +1526,11 @@ cdef shared_ptr[CInputStream] native_transcoding_input_stream( def py_buffer(object obj): """ Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + The object from which the buffer should be constructed. """ cdef shared_ptr[CBuffer] buf buf = GetResultValue(PyBuffer.FromPyObject(obj)) @@ -1478,6 +1545,18 @@ def foreign_buffer(address, size, base=None): The *base* object will be kept alive as long as this buffer is alive, including across language boundaries (for example if the buffer is referenced by C++ code). + + Parameters + ---------- + address : int + The starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + The size of device buffer in bytes. + base : {None, object} + Object that owns the referenced memory. """ cdef: intptr_t c_addr = address @@ -1625,7 +1704,7 @@ cdef class Codec(_Weakrefable): Type of compression codec to initialize, valid values are: 'gzip', 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and 'snappy'. - compression_level: int, None + compression_level : int, None Optional parameter specifying how aggressively to compress. The possible ranges and effect of this parameter depend on the specific codec chosen.
Higher values compress more but typically use more @@ -1708,9 +1787,9 @@ cdef class Codec(_Weakrefable): Parameters ---------- - compression: str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. Returns ------- @@ -1724,6 +1803,12 @@ cdef class Codec(_Weakrefable): """ Returns true if the compression level parameter is supported for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return CCodec.SupportsCompressionLevel(typ) @@ -1733,6 +1818,12 @@ cdef class Codec(_Weakrefable): """ Returns the compression level that Arrow will use for the codec if None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.DefaultCompressionLevel(typ)) @@ -1741,6 +1832,12 @@ cdef class Codec(_Weakrefable): def minimum_compression_level(str compression not None): """ Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MinimumCompressionLevel(typ)) @@ -1749,6 +1846,12 @@ cdef class Codec(_Weakrefable): def maximum_compression_level(str compression not None): """ Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MaximumCompressionLevel(typ)) @@ -1936,15 +2039,15 @@ def input_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source: str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object, ... The source to open for reading. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int, default None + buffer_size : int, default None If None or 0, no buffering will happen. Otherwise the size of the temporary read buffer. """ @@ -1988,15 +2091,15 @@ def output_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source: str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object, ... The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int, default None + buffer_size : int, default None If None or 0, no buffering will happen. 
Otherwise the size of the temporary write buffer. """ diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 4b22acc076f..9304bbb9781 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -54,6 +54,14 @@ _WriteStats = namedtuple( class WriteStats(_WriteStats): """IPC write statistics + + Parameters + ---------- + num_messages : number of messages. + num_record_batches : number of record batches. + num_dictionary_batches : number of dictionary batches. + num_dictionary_deltas : delta of dictionaries. + num_replaced_dictionaries : number of replaced dictionaries. """ __slots__ = () @@ -73,6 +81,14 @@ _ReadStats = namedtuple( class ReadStats(_ReadStats): """IPC read statistics + + Parameters + ---------- + num_messages : number of messages. + num_record_batches : number of record batches. + num_dictionary_batches : number of dictionary batches. + num_dictionary_deltas : delta of dictionaries. + num_replaced_dictionaries : number of replaced dictionaries. """ __slots__ = () @@ -85,26 +101,27 @@ cdef _wrap_read_stats(CIpcReadStats c): cdef class IpcWriteOptions(_Weakrefable): - """Serialization options for the IPC format. + """ + Serialization options for the IPC format. Parameters ---------- metadata_version : MetadataVersion, default MetadataVersion.V5 The metadata version to write. V5 is the current and latest, V4 is the pre-1.0 metadata version (with incompatible Union layout). - allow_64bit: bool, default False + allow_64bit : bool, default False If true, allow field lengths that don't fit in a signed 32-bit int. use_legacy_format : bool, default False Whether to use the pre-Arrow 0.15 IPC format. - compression: str, Codec, or None + compression : str, Codec, or None compression codec to use for record batch buffers. If None then batch buffers will be uncompressed. Must be "lz4", "zstd" or None. To specify a compression_level use `pyarrow.Codec` - use_threads: bool + use_threads : bool Whether to use the global CPU thread pool to parallelize any computational tasks like compression. - emit_dictionary_deltas: bool + emit_dictionary_deltas : bool Whether to emit dictionary deltas. Default is false for maximum stream compatibility. """ @@ -310,6 +327,13 @@ cdef class MessageReader(_Weakrefable): @staticmethod def open_stream(source): + """ + Open stream from source. + + Parameters + ---------- + source : a readable source, like an InputStream + """ cdef: MessageReader result = MessageReader.__new__(MessageReader) shared_ptr[CInputStream] in_stream @@ -781,6 +805,11 @@ cdef class _RecordBatchFileReader(_Weakrefable): def get_tensor_size(Tensor tensor): """ Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. """ cdef int64_t size with nogil: @@ -791,6 +820,11 @@ def get_tensor_size(Tensor tensor): def get_record_batch_size(RecordBatch batch): """ Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. """ cdef int64_t size with nogil: diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 049d0c95c4b..cb28a0b5fd4 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -220,7 +220,7 @@ def deserialize_pandas(buf, *, use_threads=True): ---------- buf : buffer An object compatible with the buffer protocol. 
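Editor's note: a sketch of the IPC writer options and statistics documented above (not part of the patch; compression="zstd" assumes the build includes zstd support, and `writer.stats` assumes a pyarrow version that exposes the `WriteStats` above):

    import pyarrow as pa

    batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"])
    sink = pa.BufferOutputStream()
    options = pa.ipc.IpcWriteOptions(compression="zstd")
    with pa.ipc.new_stream(sink, batch.schema, options=options) as writer:
        writer.write_batch(batch)
    print(writer.stats)        # WriteStats(num_messages=..., num_record_batches=...)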
- use_threads: bool, default True + use_threads : bool, default True Whether to parallelize the conversion using multiple threads. Returns diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 60e1f8c53bb..0d86df60136 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -60,6 +60,11 @@ def set_cpu_count(int count): """ Set the number of threads to use in parallel operations. + Parameters + ---------- + count : int + The number of concurrent threads that should be used. + See Also -------- cpu_count : Get the size of this pool. diff --git a/python/pyarrow/memory.pxi b/python/pyarrow/memory.pxi index fc0d32aad56..8ccb3505842 100644 --- a/python/pyarrow/memory.pxi +++ b/python/pyarrow/memory.pxi @@ -123,6 +123,11 @@ def proxy_memory_pool(MemoryPool parent): """ Create and return a MemoryPool instance that redirects to the *parent*, but with separate allocation statistics. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. """ cdef ProxyMemoryPool out = ProxyMemoryPool.__new__(ProxyMemoryPool) out.proxy_pool.reset(new CProxyMemoryPool(parent.pool)) @@ -134,6 +139,11 @@ def logging_memory_pool(MemoryPool parent): """ Create and return a MemoryPool instance that redirects to the *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. """ cdef LoggingMemoryPool out = LoggingMemoryPool.__new__( LoggingMemoryPool, parent) @@ -181,6 +191,14 @@ def mimalloc_memory_pool(): def set_memory_pool(MemoryPool pool): + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ c_set_default_memory_pool(pool.pool) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 78128dbf2b9..ad4d876b00e 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -214,6 +214,8 @@ class ParquetFile: Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. + read_dictionary : list + List of column names to read directly as DictionaryArray. coerce_int96_timestamp_unit : str, default None. Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' @@ -283,7 +285,9 @@ def read_row_group(self, i, columns=None, use_threads=True, Parameters ---------- - columns: list + i : int + Index of the individual row group that we want to read. + columns : list If not None, only these columns will be read from the row group. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -310,9 +314,9 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True, Parameters ---------- - row_groups: list + row_groups : list Only these row groups will be read from the file. - columns: list + columns : list If not None, only these columns will be read from the row group. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -340,12 +344,12 @@ def iter_batches(self, batch_size=65536, row_groups=None, columns=None, Parameters ---------- - batch_size: int, default 64K + batch_size : int, default 64K Maximum number of records to yield per batch. Batches may be smaller if there aren't enough rows in the file. 
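Editor's note: a sketch of the thread-count and memory-pool helpers documented above (not part of the patch; the thread count is arbitrary):

    import pyarrow as pa

    system = pa.default_memory_pool()
    pa.set_cpu_count(4)                        # cap parallel operations at 4 threads
    pool = pa.proxy_memory_pool(system)        # same allocator, separate statistics
    pa.set_memory_pool(pool)
    print(pool.bytes_allocated(), pool.max_memory())
    pa.set_memory_pool(system)                 # restore the original default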
- row_groups: list + row_groups : list Only these row groups will be read from the file. - columns: list + columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -377,7 +381,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): Parameters ---------- - columns: list + columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -542,7 +546,7 @@ def _sanitize_table(table, new_schema, flavor): filesystem : FileSystem, default None If nothing passed, will be inferred from `where` if path-like, else `where` is already a file-like object so no filesystem is needed. -compression_level: int or dict, default None +compression_level : int or dict, default None Specify the compression level for a codec, either on a general basis or per-column. If None is passed, arrow selects the compression level for the compression codec in use. The compression level has a different @@ -550,7 +554,7 @@ def _sanitize_table(table, new_schema, flavor): codec you are using. An exception is thrown if the compression codec does not allow specifying a compression level. -use_byte_stream_split: bool or list, default False +use_byte_stream_split : bool or list, default False Specify if the byte_stream_split encoding should be used in general or only for some columns. If both dictionary and byte_stream_stream are enabled, then dictionary is preferred. @@ -560,7 +564,7 @@ def _sanitize_table(table, new_schema, flavor): The serialized Parquet data page format version to write, defaults to 1.0. This does not impact the file schema logical types and Arrow to Parquet type casting behavior; for that use the "version" option. -use_compliant_nested_type: bool, default False +use_compliant_nested_type : bool, default False Whether to write compliant Parquet nested type (lists) as defined `here `_, defaults to ``False``. @@ -597,6 +601,7 @@ class ParquetWriter: where : path or file-like object schema : arrow Schema {} +writer_engine_version : unused **options : dict If options contains a key `metadata_collector` then the corresponding value is assumed to be a list (or any object with @@ -738,6 +743,8 @@ class ParquetDatasetPiece: Two-element tuples of ``(column name, ordinal index)``. row_group : int, default None Row group to load. By default, reads all row groups. + file_options : dict + Options """ def __init__(self, path, open_file_func=partial(open, mode='rb'), @@ -826,6 +833,8 @@ def read(self, columns=None, use_threads=True, partitions=None, partitions : ParquetPartitions, default None file : file-like object Passed to ParquetFile. + use_pandas_metadata : bool + If pandas metadata should be used or not. Returns ------- @@ -892,6 +901,13 @@ class PartitionSet: Then we have two partition sets, one for foo, another for bar. As we visit levels of the partition hierarchy, a PartitionSet tracks the distinct values and assigns categorical codes to use when reading the pieces + + Parameters + ---------- + name : str + Name of the partition set. Under which key to collect all values. + keys : list + All possible values that have been collected for that partition set. 
""" def __init__(self, name, keys=None): @@ -904,6 +920,10 @@ def get_index(self, key): """ Get the index of the partition value if it is known, otherwise assign one + + Parameters + ---------- + key : The value for which we want to known the index. """ if key in self.key_indices: return self.key_indices[key] @@ -1248,7 +1268,7 @@ class ParquetDataset: and different partitioning schemes are supported. {1} -metadata_nthreads: int, default 1 +metadata_nthreads : int, default 1 How many threads to allow the thread pool which is used to read the dataset metadata. Increasing this is helpful to read partitioned datasets. @@ -1463,6 +1483,11 @@ def read_pandas(self, **kwargs): Read dataset including pandas metadata, if any. Other arguments passed through to ParquetDataset.read, see docstring for further details. + Parameters + ---------- + **kwargs : optional + All additional options to pass to the reader. + Returns ------- pyarrow.Table @@ -1792,11 +1817,11 @@ def filesystem(self): Parameters ---------- -source: str, pyarrow.NativeFile, or file-like object +source : str, pyarrow.NativeFile, or file-like object If a string passed, can be a single file name or directory name. For file-like objects, only read a single file. Use pyarrow.BufferReader to read a file contained in a bytes or buffer-like object. -columns: list +columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -1837,6 +1862,11 @@ def filesystem(self): use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. +coerce_int96_timestamp_unit : str, default None. + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be infered as timestamps + in nanoseconds. Returns ------- @@ -1947,7 +1977,8 @@ def read_pandas(source, columns=None, **kwargs): read_pandas.__doc__ = _read_table_docstring.format( 'Read a Table from Parquet format, also reading DataFrame\n' 'index values if known in the file metadata', - _read_docstring_common, + "\n".join((_read_docstring_common, + """**kwargs : additional options for :func:`read_table`""")), """pyarrow.Table Content of the file as a Table of Columns, including DataFrame indexes as columns""", @@ -2003,10 +2034,12 @@ def write_table(table, where, row_group_size=None, version='1.0', Parameters ---------- table : pyarrow.Table -where: string or pyarrow.NativeFile -row_group_size: int +where : string or pyarrow.NativeFile +row_group_size : int The number of rows per rowgroup {} +**kwargs : optional + Additional options for ParquetWriter """.format(_parquet_writer_arg_docs) @@ -2179,8 +2212,9 @@ def write_metadata(schema, where, metadata_collector=None, **kwargs): Parameters ---------- schema : pyarrow.Schema - where: string or pyarrow.NativeFile - metadata_collector: + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. **kwargs : dict, Additional kwargs for ParquetWriter class. See docstring for `ParquetWriter` for more information. 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 4a345878699..a3061655851 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -873,9 +873,9 @@ cdef class ExtensionScalar(Scalar): Parameters ---------- - typ: DataType + typ : DataType The extension type for the result scalar. - value: object + value : object The storage value for the result scalar. Returns diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index 9177b2aa27b..c03721578a9 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -373,7 +373,7 @@ def serialize(object value, SerializationContext context=None): Parameters ---------- - value: object + value : object Python object for the sequence that is to be serialized. context : SerializationContext Custom serialization and deserialization context, uses a default @@ -412,9 +412,9 @@ def serialize_to(object value, sink, SerializationContext context=None): Parameters ---------- - value: object + value : object Python object for the sequence that is to be serialized. - sink: NativeFile or file-like + sink : NativeFile or file-like File the sequence will be written to. context : SerializationContext Custom serialization and deserialization context, uses a default @@ -437,9 +437,9 @@ def read_serialized(source, base=None): Parameters ---------- - source: NativeFile + source : NativeFile File to read the sequence from. - base: object + base : object This object will be the base object of all the numpy arrays contained in the sequence. @@ -478,9 +478,9 @@ def deserialize_from(source, object base, SerializationContext context=None): Parameters ---------- - source: NativeFile + source : NativeFile File to read the sequence from. - base: object + base : object This object will be the base object of all the numpy arrays contained in the sequence. context : SerializationContext diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f22e0e4f586..9ef57439500 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1006,8 +1006,8 @@ cdef class RecordBatch(_PandasConvertible): Parameters ---------- - df: pandas.DataFrame - schema: pyarrow.Schema, optional + df : pandas.DataFrame + schema : pyarrow.Schema, optional The expected schema of the RecordBatch. This can be used to indicate the type of columns if we cannot infer it automatically. If passed, the output will have exactly this schema. Columns @@ -1043,7 +1043,7 @@ cdef class RecordBatch(_PandasConvertible): Parameters ---------- - arrays: list of pyarrow.Array + arrays : list of pyarrow.Array One for each field in RecordBatch names : list of str, optional Names for the batch fields. If not passed, schema must be passed @@ -2283,7 +2283,7 @@ def concat_tables(tables, c_bool promote=False, MemoryPool memory_pool=None): ---------- tables : iterable of pyarrow.Table objects Pyarrow tables to concatenate into a single Table. - promote: bool, default False + promote : bool, default False If True, concatenate tables with null-filling and null type promotion. memory_pool : MemoryPool, default None For memory allocations, if required, otherwise use default pool. diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 9bc24ceb473..42fd4474155 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -38,6 +38,16 @@ strides: {0.strides}""".format(self) @staticmethod def from_numpy(obj, dim_names=None): + """ + Create a Tensor from a numpy array. 
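For illustration, a small sketch of Tensor.from_numpy with dim_names and of concat_tables with promote=True, as documented above; the values are arbitrary:

    import numpy as np
    import pyarrow as pa

    arr = np.arange(12, dtype=np.float64).reshape(3, 4)
    tensor = pa.Tensor.from_numpy(arr, dim_names=["row", "col"])
    print(tensor.shape, tensor.dim_names)   # (3, 4) ['row', 'col']

    # concat_tables with promote=True null-fills columns missing from a schema.
    t1 = pa.table({"a": [1, 2]})
    t2 = pa.table({"a": [3], "b": ["x"]})
    combined = pa.concat_tables([t1, t2], promote=True)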
+ + Parameters + ---------- + obj : numpy.ndarray + The source numpy array. + dim_names : list, optional + Names of each dimension of the Tensor. + """ cdef: vector[c_string] c_dim_names shared_ptr[CTensor] ctensor @@ -160,6 +170,17 @@ shape: {0.shape}""".format(self) def from_numpy(data, coords, shape, dim_names=None): """ Create arrow::SparseCOOTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + coords : numpy.ndarray + Coordinates of the data. + shape : tuple + Shape of the tensor. + dim_names : list, optional + Names of the dimensions. """ cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef vector[int64_t] c_shape @@ -186,6 +207,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + + Parameters + ---------- + obj : scipy.sparse.coo_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.coo_matrix): @@ -225,6 +253,13 @@ shape: {0.shape}""".format(self) def from_pydata_sparse(obj, dim_names=None): """ Convert pydata/sparse.COO to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : pydata.sparse.COO + The sparse multidimensional array that should be converted. + dim_names : list, optional + Names of the dimensions. """ import sparse if not isinstance(obj, sparse.COO): @@ -252,6 +287,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. """ cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -395,13 +435,34 @@ shape: {0.shape}""".format(self) def from_dense_numpy(cls, obj, dim_names=None): """ Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The dense numpy array that should be converted. + dim_names : list, optional + The names of the dimensions. """ return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names)) @staticmethod def from_numpy(data, indptr, indices, shape, dim_names=None): """ - Create arrow::SparseCSRMatrix from numpy.ndarrays + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows. + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. """ cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef vector[int64_t] c_shape @@ -432,6 +493,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.csr_matrix): @@ -462,6 +530,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted.
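For illustration, a short sketch of building sparse tensors from SciPy matrices as documented above; the matrix values are arbitrary and scipy is assumed to be installed:

    import numpy as np
    import scipy.sparse
    import pyarrow as pa

    m = scipy.sparse.coo_matrix(
        (np.array([1.0, 2.0, 3.0]),
         (np.array([0, 0, 1]), np.array([0, 2, 2]))),
        shape=(2, 3))
    coo = pa.SparseCOOTensor.from_scipy(m, dim_names=["row", "col"])
    csr = pa.SparseCSRMatrix.from_scipy(m.tocsr())
    print(coo.shape, csr.shape)   # (2, 3) (2, 3)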
""" cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -585,6 +658,20 @@ shape: {0.shape}""".format(self) def from_numpy(data, indptr, indices, shape, dim_names=None): """ Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. """ cdef shared_ptr[CSparseCSCMatrix] csparse_tensor cdef vector[int64_t] c_shape @@ -615,6 +702,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.csc_matrix): @@ -645,6 +739,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. """ cdef shared_ptr[CSparseCSCMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -742,6 +841,13 @@ shape: {0.shape}""".format(self) cdef class SparseCSFTensor(_Weakrefable): """ A sparse CSF tensor. + + CSF is a generalization of compressed sparse row (CSR) index. + + CSF index recursively compresses each dimension of a tensor into a set + of prefix trees. Each path from a root to leaf forms one tensor + non-zero index. CSF is implemented with two arrays of buffers and one + arrays of integers. """ def __init__(self): @@ -771,6 +877,28 @@ shape: {0.shape}""".format(self) dim_names=None): """ Create arrow::SparseCSFTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse tensor. + indptr : numpy.ndarray + The sparsity structure. + Each two consecutive dimensions in a tensor correspond to + a buffer in indices. + A pair of consecutive values at `indptr[dim][i]` + `indptr[dim][i + 1]` signify a range of nodes in + `indices[dim + 1]` who are children of `indices[dim][i]` node. + indices : numpy.ndarray + Stores values of nodes. + Each tensor dimension corresponds to a buffer in indptr. + shape : tuple + Shape of the matrix. + axis_order : list, optional + the sequence in which dimensions were traversed to + produce the prefix tree. + dim_names : list, optional + Names of the dimensions. """ cdef shared_ptr[CSparseCSFTensor] csparse_tensor cdef vector[int64_t] c_axis_order @@ -817,6 +945,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSFTensor + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. """ cdef shared_ptr[CSparseCSFTensor] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 579b56b64e7..1ac3c30a381 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -650,10 +650,13 @@ def test_generated_docstrings(): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. 
options : pyarrow.compute.ScalarAggregateOptions, optional - Parameters altering compute function semantics - **kwargs : optional - Parameters for ScalarAggregateOptions constructor. Either `options` - or `**kwargs` can be passed, but not both at the same time. + Parameters altering compute function semantics. + skip_nulls : optional + Parameter for ScalarAggregateOptions constructor. Either `options` + or `skip_nulls` can be passed, but not both at the same time. + min_count : optional + Parameter for ScalarAggregateOptions constructor. Either `options` + or `min_count` can be passed, but not both at the same time. """) assert pc.add.__doc__ == textwrap.dedent("""\ Add the arguments element-wise. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index cf98486c70c..b4e0b659df5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -751,6 +751,11 @@ cdef class BaseExtensionType(DataType): cdef class ExtensionType(BaseExtensionType): """ Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + extension_name : str """ def __cinit__(self): @@ -764,11 +769,6 @@ cdef class ExtensionType(BaseExtensionType): This should be called at the end of the subclass' ``__init__`` method. - - Parameters - ---------- - storage_type : DataType - extension_name : str """ cdef: shared_ptr[CExtensionType] cpy_ext_type @@ -841,6 +841,11 @@ cdef class PyExtensionType(ExtensionType): """ Concrete base class for Python-defined extension types based on pickle for (de)serialization. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. """ def __cinit__(self): @@ -880,6 +885,13 @@ cdef class UnknownExtensionType(PyExtensionType): """ A concrete class for Python-defined extension types that refer to an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialized output. """ cdef: @@ -941,6 +953,16 @@ def unregister_extension_type(type_name): cdef class KeyValueMetadata(_Metadata, Mapping): + """ + KeyValueMetadata + + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata. + **kwargs : optional + Additional key-value metadata. + """ def __init__(self, __arg0__=None, **kwargs): cdef: @@ -2709,6 +2731,11 @@ def type_for_alias(name): """ Return DataType given a string alias if one exists. + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. + Returns ------- type : DataType @@ -2741,7 +2768,7 @@ def schema(fields, metadata=None): Parameters ---------- - field : iterable of Fields or tuples, or mapping of strings to DataTypes + fields : iterable of Fields or tuples, or mapping of strings to DataTypes metadata : dict, default None Keys and values must be coercible to bytes. @@ -2797,6 +2824,10 @@ def schema(fields, metadata=None): def from_numpy_dtype(object dtype): """ Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : numpy.dtype + The NumPy dtype to convert. """ cdef shared_ptr[CDataType] c_type dtype = np.dtype(dtype) @@ -2807,14 +2838,38 @@ def from_numpy_dtype(object dtype): def is_boolean_value(object obj): + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ return IsPyBool(obj) def is_integer_value(object obj): + """ + Check if the object is an integer.
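For illustration, a small sketch of type_for_alias, from_numpy_dtype and KeyValueMetadata as documented above; the metadata keys and values are arbitrary:

    import numpy as np
    import pyarrow as pa

    assert pa.type_for_alias("f8") == pa.float64()
    assert pa.from_numpy_dtype(np.dtype("int32")) == pa.int32()

    # KeyValueMetadata takes a dict positionally and/or extra keyword pairs.
    meta = pa.KeyValueMetadata({"origin": "sensor-1"}, unit="celsius")
    sch = pa.schema([("reading", pa.float64())], metadata=meta)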
+ + Parameters + ---------- + obj : object + The object to check + """ return IsPyInt(obj) def is_float_value(object obj): + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ return IsPyFloat(obj) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 708e2bc4643..041946d66e6 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -44,6 +44,10 @@ def is_null(t): """ Return True if value is an instance of a null type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_NA @@ -51,6 +55,10 @@ def is_null(t): def is_boolean(t): """ Return True if value is an instance of a boolean type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_BOOL @@ -58,6 +66,10 @@ def is_boolean(t): def is_integer(t): """ Return True if value is an instance of any integer type. + + Parameters + ---------- + t : DataType """ return t.id in _INTEGER_TYPES @@ -65,6 +77,10 @@ def is_integer(t): def is_signed_integer(t): """ Return True if value is an instance of any signed integer type. + + Parameters + ---------- + t : DataType """ return t.id in _SIGNED_INTEGER_TYPES @@ -72,6 +88,10 @@ def is_signed_integer(t): def is_unsigned_integer(t): """ Return True if value is an instance of any unsigned integer type. + + Parameters + ---------- + t : DataType """ return t.id in _UNSIGNED_INTEGER_TYPES @@ -79,6 +99,10 @@ def is_unsigned_integer(t): def is_int8(t): """ Return True if value is an instance of an int8 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_INT8 @@ -86,6 +110,10 @@ def is_int8(t): def is_int16(t): """ Return True if value is an instance of an int16 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_INT16 @@ -93,6 +121,10 @@ def is_int16(t): def is_int32(t): """ Return True if value is an instance of an int32 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_INT32 @@ -100,6 +132,10 @@ def is_int32(t): def is_int64(t): """ Return True if value is an instance of an int64 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_INT64 @@ -107,6 +143,10 @@ def is_int64(t): def is_uint8(t): """ Return True if value is an instance of an uint8 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_UINT8 @@ -114,6 +154,10 @@ def is_uint8(t): def is_uint16(t): """ Return True if value is an instance of an uint16 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_UINT16 @@ -121,6 +165,10 @@ def is_uint16(t): def is_uint32(t): """ Return True if value is an instance of an uint32 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_UINT32 @@ -128,6 +176,10 @@ def is_uint32(t): def is_uint64(t): """ Return True if value is an instance of an uint64 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_UINT64 @@ -135,6 +187,10 @@ def is_uint64(t): def is_floating(t): """ Return True if value is an instance of a floating point numeric type. + + Parameters + ---------- + t : DataType """ return t.id in _FLOATING_TYPES @@ -142,6 +198,10 @@ def is_floating(t): def is_float16(t): """ Return True if value is an instance of a float16 (half-precision) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_HALF_FLOAT @@ -149,6 +209,10 @@ def is_float16(t): def is_float32(t): """ Return True if value is an instance of a float32 (single precision) type. 
+ + Parameters + ---------- + t : DataType """ return t.id == lib.Type_FLOAT @@ -156,6 +220,10 @@ def is_float32(t): def is_float64(t): """ Return True if value is an instance of a float64 (double precision) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DOUBLE @@ -163,6 +231,10 @@ def is_float64(t): def is_list(t): """ Return True if value is an instance of a list type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_LIST @@ -170,6 +242,10 @@ def is_list(t): def is_large_list(t): """ Return True if value is an instance of a large list type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_LARGE_LIST @@ -177,6 +253,10 @@ def is_large_list(t): def is_fixed_size_list(t): """ Return True if value is an instance of a fixed size list type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_FIXED_SIZE_LIST @@ -184,6 +264,10 @@ def is_fixed_size_list(t): def is_struct(t): """ Return True if value is an instance of a struct type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_STRUCT @@ -191,6 +275,10 @@ def is_struct(t): def is_union(t): """ Return True if value is an instance of a union type. + + Parameters + ---------- + t : DataType """ return t.id in _UNION_TYPES @@ -198,6 +286,10 @@ def is_union(t): def is_nested(t): """ Return True if value is an instance of a nested type. + + Parameters + ---------- + t : DataType """ return t.id in _NESTED_TYPES @@ -205,6 +297,10 @@ def is_nested(t): def is_temporal(t): """ Return True if value is an instance of date, time, timestamp or duration. + + Parameters + ---------- + t : DataType """ return t.id in _TEMPORAL_TYPES @@ -212,6 +308,10 @@ def is_temporal(t): def is_timestamp(t): """ Return True if value is an instance of a timestamp type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_TIMESTAMP @@ -219,6 +319,10 @@ def is_timestamp(t): def is_duration(t): """ Return True if value is an instance of a duration type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DURATION @@ -226,6 +330,10 @@ def is_duration(t): def is_time(t): """ Return True if value is an instance of a time type. + + Parameters + ---------- + t : DataType """ return t.id in _TIME_TYPES @@ -233,6 +341,10 @@ def is_time(t): def is_time32(t): """ Return True if value is an instance of a time32 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_TIME32 @@ -240,6 +352,10 @@ def is_time32(t): def is_time64(t): """ Return True if value is an instance of a time64 type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_TIME64 @@ -247,6 +363,10 @@ def is_time64(t): def is_binary(t): """ Return True if value is an instance of a variable-length binary type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_BINARY @@ -255,6 +375,10 @@ def is_large_binary(t): """ Return True if value is an instance of a large variable-length binary type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_LARGE_BINARY @@ -262,6 +386,10 @@ def is_large_binary(t): def is_unicode(t): """ Alias for is_string. + + Parameters + ---------- + t : DataType """ return is_string(t) @@ -269,6 +397,10 @@ def is_unicode(t): def is_string(t): """ Return True if value is an instance of string (utf8 unicode) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_STRING @@ -276,6 +408,10 @@ def is_string(t): def is_large_unicode(t): """ Alias for is_large_string. 
+ + Parameters + ---------- + t : DataType """ return is_large_string(t) @@ -283,6 +419,10 @@ def is_large_unicode(t): def is_large_string(t): """ Return True if value is an instance of large string (utf8 unicode) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_LARGE_STRING @@ -290,6 +430,10 @@ def is_large_string(t): def is_fixed_size_binary(t): """ Return True if value is an instance of a fixed size binary type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_FIXED_SIZE_BINARY @@ -297,6 +441,10 @@ def is_fixed_size_binary(t): def is_date(t): """ Return True if value is an instance of a date type. + + Parameters + ---------- + t : DataType """ return t.id in _DATE_TYPES @@ -304,6 +452,10 @@ def is_date(t): def is_date32(t): """ Return True if value is an instance of a date32 (days) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DATE32 @@ -311,6 +463,10 @@ def is_date32(t): def is_date64(t): """ Return True if value is an instance of a date64 (milliseconds) type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DATE64 @@ -318,6 +474,10 @@ def is_date64(t): def is_map(t): """ Return True if value is an instance of a map logical type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_MAP @@ -325,6 +485,10 @@ def is_map(t): def is_decimal(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType """ return t.id in _DECIMAL_TYPES @@ -332,6 +496,10 @@ def is_decimal(t): def is_decimal128(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DECIMAL128 @@ -339,6 +507,10 @@ def is_decimal128(t): def is_decimal256(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DECIMAL256 @@ -346,6 +518,10 @@ def is_decimal256(t): def is_dictionary(t): """ Return True if value is an instance of a dictionary-encoded type. + + Parameters + ---------- + t : DataType """ return t.id == lib.Type_DICTIONARY @@ -353,5 +529,9 @@ def is_dictionary(t): def is_primitive(t): """ Return True if the value is an instance of a primitive type. + + Parameters + ---------- + t : DataType """ return lib._is_primitive(t.id)
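For illustration, a short sketch exercising several of the pyarrow.types predicates documented above:

    import pyarrow as pa
    import pyarrow.types as types

    t = pa.timestamp("ms", tz="UTC")
    assert types.is_timestamp(t)
    assert types.is_temporal(t)
    assert not types.is_integer(t)

    assert types.is_decimal(pa.decimal128(10, 2))
    assert types.is_dictionary(pa.dictionary(pa.int8(), pa.string()))
    assert types.is_primitive(pa.float32())
    assert not types.is_primitive(pa.list_(pa.int64()))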