From 1e9d6ed362c8b4040f564539dd739bc6f14ec443 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 24 Sep 2021 11:44:25 +0200 Subject: [PATCH 01/25] numpydoc fix for _json module --- python/pyarrow/_json.pyx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/_json.pyx b/python/pyarrow/_json.pyx index 183bd4fdd4d..1c08e546ec9 100644 --- a/python/pyarrow/_json.pyx +++ b/python/pyarrow/_json.pyx @@ -86,12 +86,12 @@ cdef class ParseOptions(_Weakrefable): Parameters ---------- - explicit_schema: Schema, optional (default None) + explicit_schema : Schema, optional (default None) Optional explicit schema (no type inference, ignores other fields). - newlines_in_values: bool, optional (default False) + newlines_in_values : bool, optional (default False) Whether objects may be printed across multiple lines (for example pretty printed). If false, input must end with an empty line. - unexpected_field_behavior: str, default "infer" + unexpected_field_behavior : str, default "infer" How JSON fields outside of explicit_schema (if given) are treated. Possible behaviors: @@ -211,16 +211,16 @@ def read_json(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : str, path or file-like object The location of JSON data. Currently only the line-delimited JSON format is supported. - read_options: pyarrow.json.ReadOptions, optional - Options for the JSON reader (see ReadOptions constructor for defaults) - parse_options: pyarrow.json.ParseOptions, optional + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional Options for the JSON parser - (see ParseOptions constructor for defaults) - memory_pool: MemoryPool, optional - Pool to allocate Table memory from + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. Returns ------- From f5584ba3cdce082cf6b2859b5d866093d194a7ac Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 24 Sep 2021 11:51:30 +0200 Subject: [PATCH 02/25] numpy doc _csv module --- python/pyarrow/_csv.pyx | 85 +++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 950e2d5464c..19ade43249c 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -59,10 +59,10 @@ cdef class ReadOptions(_Weakrefable): This will determine multi-threading granularity as well as the size of individual record batches or table chunks. Minimum valid value for block size is 1 - skip_rows: int, optional (default 0) + skip_rows : int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. - skip_rows_after_names: int, optional (default 0) + skip_rows_after_names : int, optional (default 0) The number of rows to skip after the column names. This number can be larger than the number of rows in one block, and empty rows are counted. @@ -70,15 +70,15 @@ cdef class ReadOptions(_Weakrefable): - `skip_rows` is applied (if non-zero); - column names aread (unless `column_names` is set); - `skip_rows_after_names` is applied (if non-zero). - column_names: list, optional + column_names : list, optional The column names of the target table. If empty, fall back on `autogenerate_column_names`. 
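# A minimal usage sketch for the line-delimited JSON reader documented in the
# patch above; the file name "records.jsonl" and its schema are assumptions.
import pyarrow as pa
from pyarrow import json

parse_opts = json.ParseOptions(
    explicit_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]),
    unexpected_field_behavior="ignore",
)
table = json.read_json("records.jsonl", parse_options=parse_opts)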
- autogenerate_column_names: bool, optional (default False) + autogenerate_column_names : bool, optional (default False) Whether to autogenerate column names if `column_names` is empty. If true, column names will be of the form "f0", "f1"... If false, column names will be read from the first CSV row after `skip_rows`. - encoding: str, optional (default 'utf8') + encoding : str, optional (default 'utf8') The character encoding of the CSV data. Columns that cannot decode using this encoding can still be read as Binary. """ @@ -235,22 +235,22 @@ cdef class ParseOptions(_Weakrefable): Parameters ---------- - delimiter: 1-character string, optional (default ',') + delimiter : 1-character string, optional (default ',') The character delimiting individual cells in the CSV data. - quote_char: 1-character string or False, optional (default '"') + quote_char : 1-character string or False, optional (default '"') The character used optionally for quoting CSV values (False if quoting is not allowed). - double_quote: bool, optional (default True) + double_quote : bool, optional (default True) Whether two quotes in a quoted CSV value denote a single quote in the data. - escape_char: 1-character string or False, optional (default False) + escape_char : 1-character string or False, optional (default False) The character used optionally for escaping special characters (False if escaping is not allowed). - newlines_in_values: bool, optional (default False) + newlines_in_values : bool, optional (default False) Whether newline characters are allowed in CSV values. Setting this to True reduces the performance of multi-threaded CSV reading. - ignore_empty_lines: bool, optional (default True) + ignore_empty_lines : bool, optional (default True) Whether empty lines are ignored in CSV input. If False, an empty line is interpreted as containing a single empty value (assuming a one-column CSV file). @@ -423,53 +423,53 @@ cdef class ConvertOptions(_Weakrefable): ---------- check_utf8 : bool, optional (default True) Whether to check UTF8 validity of string columns. - column_types: pa.Schema or dict, optional + column_types : pa.Schema or dict, optional Explicitly map column names to column types. Passing this argument disables type inference on the defined columns. - null_values: list, optional + null_values : list, optional A sequence of strings that denote nulls in the data (defaults are appropriate in most cases). Note that by default, string columns are not checked for null values. To enable null checking for those, specify ``strings_can_be_null=True``. - true_values: list, optional + true_values : list, optional A sequence of strings that denote true booleans in the data (defaults are appropriate in most cases). - false_values: list, optional + false_values : list, optional A sequence of strings that denote false booleans in the data (defaults are appropriate in most cases). - decimal_point: 1-character string, optional (default '.') + decimal_point : 1-character string, optional (default '.') The character used as decimal point in floating-point and decimal data. - timestamp_parsers: list, optional + timestamp_parsers : list, optional A sequence of strptime()-compatible format strings, tried in order when attempting to infer or convert timestamp values (the special value ISO8601() can also be given). By default, a fast built-in ISO-8601 parser is used. - strings_can_be_null: bool, optional (default False) + strings_can_be_null : bool, optional (default False) Whether string / binary columns can have null values. 
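# A hedged sketch of the csv.ReadOptions/ParseOptions fields documented above,
# assuming a hypothetical ';'-delimited file whose first physical row is skipped.
from pyarrow import csv

read_opts = csv.ReadOptions(column_names=["id", "value"], skip_rows=1,
                            encoding="utf8")
parse_opts = csv.ParseOptions(delimiter=";", ignore_empty_lines=True)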
If true, then strings in null_values are considered null for string columns. If false, then all strings are valid string values. - quoted_strings_can_be_null: bool, optional (default True) + quoted_strings_can_be_null : bool, optional (default True) Whether quoted values can be null. If true, then strings in "null_values" are also considered null when they appear quoted in the CSV file. Otherwise, quoted values are never considered null. - auto_dict_encode: bool, optional (default False) + auto_dict_encode : bool, optional (default False) Whether to try to automatically dict-encode string / binary data. If true, then when type inference detects a string or binary column, it it dict-encoded up to `auto_dict_max_cardinality` distinct values (per chunk), after which it switches to regular encoding. This setting is ignored for non-inferred columns (those in `column_types`). - auto_dict_max_cardinality: int, optional + auto_dict_max_cardinality : int, optional The maximum dictionary cardinality for `auto_dict_encode`. This value is per chunk. - include_columns: list, optional + include_columns : list, optional The names of columns to include in the Table. If empty, the Table will include all columns from the CSV file. If not empty, only these columns will be included, in this order. - include_missing_columns: bool, optional (default False) + include_missing_columns : bool, optional (default False) If false, columns in `include_columns` but not in the CSV file will error out. If true, columns in `include_columns` but not in the CSV file will @@ -848,20 +848,20 @@ def read_csv(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : string, path or file-like object The location of CSV data. If a string or path, and if it ends with a recognized compressed file extension (e.g. ".gz" or ".bz2"), the data is automatically decompressed when reading. - read_options: pyarrow.csv.ReadOptions, optional + read_options : pyarrow.csv.ReadOptions, optional Options for the CSV reader (see pyarrow.csv.ReadOptions constructor for defaults) - parse_options: pyarrow.csv.ParseOptions, optional + parse_options : pyarrow.csv.ParseOptions, optional Options for the CSV parser (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options: pyarrow.csv.ConvertOptions, optional + convert_options : pyarrow.csv.ConvertOptions, optional Options for converting CSV data (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool to allocate Table memory from Returns @@ -906,20 +906,20 @@ def open_csv(input_file, read_options=None, parse_options=None, Parameters ---------- - input_file: string, path or file-like object + input_file : string, path or file-like object The location of CSV data. If a string or path, and if it ends with a recognized compressed file extension (e.g. ".gz" or ".bz2"), the data is automatically decompressed when reading. 
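# Sketch of read_csv combined with the ConvertOptions documented above; the
# path "data.csv" and the column names are placeholders.
import pyarrow as pa
from pyarrow import csv

convert_opts = csv.ConvertOptions(
    column_types={"price": pa.float64()},
    null_values=["", "NA"],
    strings_can_be_null=True,
    include_columns=["price", "currency"],
)
table = csv.read_csv("data.csv", convert_options=convert_opts)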
- read_options: pyarrow.csv.ReadOptions, optional + read_options : pyarrow.csv.ReadOptions, optional Options for the CSV reader (see pyarrow.csv.ReadOptions constructor for defaults) - parse_options: pyarrow.csv.ParseOptions, optional + parse_options : pyarrow.csv.ParseOptions, optional Options for the CSV parser (see pyarrow.csv.ParseOptions constructor for defaults) - convert_options: pyarrow.csv.ConvertOptions, optional + convert_options : pyarrow.csv.ConvertOptions, optional Options for converting CSV data (see pyarrow.csv.ConvertOptions constructor for defaults) - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool to allocate Table memory from Returns @@ -1014,13 +1014,13 @@ def write_csv(data, output_file, write_options=None, Parameters ---------- - data: pyarrow.RecordBatch or pyarrow.Table + data : pyarrow.RecordBatch or pyarrow.Table The data to write. - output_file: string, path, pyarrow.NativeFile, or file-like object + output_file : string, path, pyarrow.NativeFile, or file-like object The location where to write the CSV data. - write_options: pyarrow.csv.WriteOptions + write_options : pyarrow.csv.WriteOptions Options to configure writing the CSV data. - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool for temporary allocations. """ cdef: @@ -1047,17 +1047,18 @@ def write_csv(data, output_file, write_options=None, cdef class CSVWriter(_CRecordBatchWriter): - """Writer to create a CSV file. + """ + Writer to create a CSV file. Parameters ---------- - sink: string, path, pyarrow.OutputStream or file-like object + sink : str, path, pyarrow.OutputStream or file-like object The location where to write the CSV data. - schema: pyarrow.Schema + schema : pyarrow.Schema The schema of the data to be written. - write_options: pyarrow.csv.WriteOptions + write_options : pyarrow.csv.WriteOptions Options to configure writing the CSV data. - memory_pool: MemoryPool, optional + memory_pool : MemoryPool, optional Pool for temporary allocations. """ From 323a82391c8bfd1e72949a7e252c1dcabaefdb3b Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 24 Sep 2021 12:03:19 +0200 Subject: [PATCH 03/25] Document types --- python/pyarrow/types.py | 225 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 708e2bc4643..5f76cbc7f88 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -44,6 +44,11 @@ def is_null(t): """ Return True if value is an instance of a null type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_NA @@ -51,6 +56,11 @@ def is_null(t): def is_boolean(t): """ Return True if value is an instance of a boolean type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_BOOL @@ -58,6 +68,11 @@ def is_boolean(t): def is_integer(t): """ Return True if value is an instance of any integer type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _INTEGER_TYPES @@ -65,6 +80,11 @@ def is_integer(t): def is_signed_integer(t): """ Return True if value is an instance of any signed integer type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _SIGNED_INTEGER_TYPES @@ -72,6 +92,11 @@ def is_signed_integer(t): def is_unsigned_integer(t): """ Return True if value is an instance of any unsigned integer type. 
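# Sketch of the write_csv/CSVWriter APIs documented above; the output paths
# are placeholders.
import pyarrow as pa
from pyarrow import csv

table = pa.table({"id": [1, 2], "name": ["a", "b"]})
csv.write_csv(table, "out.csv",
              write_options=csv.WriteOptions(include_header=True))

writer = csv.CSVWriter("out_incremental.csv", table.schema)
writer.write_table(table)  # can be called repeatedly for streaming writes
writer.close()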
+ + Parameters + ---------- + t : DataType + type to check """ return t.id in _UNSIGNED_INTEGER_TYPES @@ -79,6 +104,11 @@ def is_unsigned_integer(t): def is_int8(t): """ Return True if value is an instance of an int8 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_INT8 @@ -86,6 +116,11 @@ def is_int8(t): def is_int16(t): """ Return True if value is an instance of an int16 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_INT16 @@ -93,6 +128,11 @@ def is_int16(t): def is_int32(t): """ Return True if value is an instance of an int32 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_INT32 @@ -100,6 +140,11 @@ def is_int32(t): def is_int64(t): """ Return True if value is an instance of an int64 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_INT64 @@ -107,6 +152,11 @@ def is_int64(t): def is_uint8(t): """ Return True if value is an instance of an uint8 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_UINT8 @@ -114,6 +164,11 @@ def is_uint8(t): def is_uint16(t): """ Return True if value is an instance of an uint16 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_UINT16 @@ -121,6 +176,11 @@ def is_uint16(t): def is_uint32(t): """ Return True if value is an instance of an uint32 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_UINT32 @@ -128,6 +188,11 @@ def is_uint32(t): def is_uint64(t): """ Return True if value is an instance of an uint64 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_UINT64 @@ -135,6 +200,11 @@ def is_uint64(t): def is_floating(t): """ Return True if value is an instance of a floating point numeric type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _FLOATING_TYPES @@ -142,6 +212,11 @@ def is_floating(t): def is_float16(t): """ Return True if value is an instance of a float16 (half-precision) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_HALF_FLOAT @@ -149,6 +224,11 @@ def is_float16(t): def is_float32(t): """ Return True if value is an instance of a float32 (single precision) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_FLOAT @@ -156,6 +236,11 @@ def is_float32(t): def is_float64(t): """ Return True if value is an instance of a float64 (double precision) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DOUBLE @@ -163,6 +248,11 @@ def is_float64(t): def is_list(t): """ Return True if value is an instance of a list type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_LIST @@ -170,6 +260,11 @@ def is_list(t): def is_large_list(t): """ Return True if value is an instance of a large list type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_LARGE_LIST @@ -177,6 +272,11 @@ def is_large_list(t): def is_fixed_size_list(t): """ Return True if value is an instance of a fixed size list type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_FIXED_SIZE_LIST @@ -184,6 +284,11 @@ def is_fixed_size_list(t): def is_struct(t): """ Return True if value is an instance of a struct type. 
+ + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_STRUCT @@ -191,6 +296,11 @@ def is_struct(t): def is_union(t): """ Return True if value is an instance of a union type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _UNION_TYPES @@ -198,6 +308,11 @@ def is_union(t): def is_nested(t): """ Return True if value is an instance of a nested type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _NESTED_TYPES @@ -205,6 +320,11 @@ def is_nested(t): def is_temporal(t): """ Return True if value is an instance of date, time, timestamp or duration. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _TEMPORAL_TYPES @@ -212,6 +332,11 @@ def is_temporal(t): def is_timestamp(t): """ Return True if value is an instance of a timestamp type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_TIMESTAMP @@ -219,6 +344,11 @@ def is_timestamp(t): def is_duration(t): """ Return True if value is an instance of a duration type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DURATION @@ -226,6 +356,11 @@ def is_duration(t): def is_time(t): """ Return True if value is an instance of a time type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _TIME_TYPES @@ -233,6 +368,11 @@ def is_time(t): def is_time32(t): """ Return True if value is an instance of a time32 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_TIME32 @@ -240,6 +380,11 @@ def is_time32(t): def is_time64(t): """ Return True if value is an instance of a time64 type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_TIME64 @@ -247,6 +392,11 @@ def is_time64(t): def is_binary(t): """ Return True if value is an instance of a variable-length binary type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_BINARY @@ -255,6 +405,11 @@ def is_large_binary(t): """ Return True if value is an instance of a large variable-length binary type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_LARGE_BINARY @@ -262,6 +417,11 @@ def is_large_binary(t): def is_unicode(t): """ Alias for is_string. + + Parameters + ---------- + t : DataType + type to check """ return is_string(t) @@ -269,6 +429,11 @@ def is_unicode(t): def is_string(t): """ Return True if value is an instance of string (utf8 unicode) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_STRING @@ -276,6 +441,11 @@ def is_string(t): def is_large_unicode(t): """ Alias for is_large_string. + + Parameters + ---------- + t : DataType + type to check """ return is_large_string(t) @@ -283,6 +453,11 @@ def is_large_unicode(t): def is_large_string(t): """ Return True if value is an instance of large string (utf8 unicode) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_LARGE_STRING @@ -290,6 +465,11 @@ def is_large_string(t): def is_fixed_size_binary(t): """ Return True if value is an instance of a fixed size binary type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_FIXED_SIZE_BINARY @@ -297,6 +477,11 @@ def is_fixed_size_binary(t): def is_date(t): """ Return True if value is an instance of a date type. 
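# Small sketch exercising the pyarrow.types predicates documented above.
import pyarrow as pa
import pyarrow.types as pt

assert pt.is_integer(pa.int32())
assert pt.is_signed_integer(pa.int8())
assert pt.is_temporal(pa.timestamp("ms"))
assert not pt.is_floating(pa.string())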
+ + Parameters + ---------- + t : DataType + type to check """ return t.id in _DATE_TYPES @@ -304,6 +489,11 @@ def is_date(t): def is_date32(t): """ Return True if value is an instance of a date32 (days) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DATE32 @@ -311,6 +501,11 @@ def is_date32(t): def is_date64(t): """ Return True if value is an instance of a date64 (milliseconds) type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DATE64 @@ -318,6 +513,11 @@ def is_date64(t): def is_map(t): """ Return True if value is an instance of a map logical type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_MAP @@ -325,6 +525,11 @@ def is_map(t): def is_decimal(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType + type to check """ return t.id in _DECIMAL_TYPES @@ -332,6 +537,11 @@ def is_decimal(t): def is_decimal128(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DECIMAL128 @@ -339,6 +549,11 @@ def is_decimal128(t): def is_decimal256(t): """ Return True if value is an instance of a decimal type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DECIMAL256 @@ -346,6 +561,11 @@ def is_decimal256(t): def is_dictionary(t): """ Return True if value is an instance of a dictionary-encoded type. + + Parameters + ---------- + t : DataType + type to check """ return t.id == lib.Type_DICTIONARY @@ -353,5 +573,10 @@ def is_dictionary(t): def is_primitive(t): """ Return True if the value is an instance of a primitive type. + + Parameters + ---------- + t : DataType + type to check """ return lib._is_primitive(t.id) From a92e6aa5f6935b7b9da2713bc6e82d22fb0ca190 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 24 Sep 2021 12:29:20 +0200 Subject: [PATCH 04/25] Fix compute functions docstring --- python/pyarrow/compute.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 8640ea683da..53c49fb5cda 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -132,11 +132,15 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): if option_class is not None: doc_pieces.append("""\ options : pyarrow.compute.{0}, optional - Parameters altering compute function semantics - **kwargs : optional - Parameters for {0} constructor. Either `options` - or `**kwargs` can be passed, but not both at the same time. + Parameters altering compute function semantics. """.format(option_class.__name__)) + options_sig = inspect.signature(option_class) + for p in options_sig.parameters.values(): + doc_pieces.append("""\ + {0} : optional + Parameter for {1} constructor. Either `options` + or `{0}` can be passed, but not both at the same time. 
+ """.format(p.name, option_class.__name__)) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper From 758df9d20ba1ec85a196311168def52fee4a5e6e Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 24 Sep 2021 15:02:35 +0200 Subject: [PATCH 05/25] Parquet docstrings --- python/pyarrow/parquet.py | 70 +++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 78128dbf2b9..682c44e2e25 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -214,6 +214,8 @@ class ParquetFile: Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. + read_dictionary : list + List of names to read directly as DictionaryArray coerce_int96_timestamp_unit : str, default None. Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' @@ -283,7 +285,9 @@ def read_row_group(self, i, columns=None, use_threads=True, Parameters ---------- - columns: list + i : int + Index of the individual row group that we want to read. + columns : list If not None, only these columns will be read from the row group. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -310,9 +314,9 @@ def read_row_groups(self, row_groups, columns=None, use_threads=True, Parameters ---------- - row_groups: list + row_groups : list Only these row groups will be read from the file. - columns: list + columns : list If not None, only these columns will be read from the row group. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -340,12 +344,12 @@ def iter_batches(self, batch_size=65536, row_groups=None, columns=None, Parameters ---------- - batch_size: int, default 64K + batch_size : int, default 64K Maximum number of records to yield per batch. Batches may be smaller if there aren't enough rows in the file. - row_groups: list + row_groups : list Only these row groups will be read from the file. - columns: list + columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -377,7 +381,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): Parameters ---------- - columns: list + columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. @@ -542,7 +546,7 @@ def _sanitize_table(table, new_schema, flavor): filesystem : FileSystem, default None If nothing passed, will be inferred from `where` if path-like, else `where` is already a file-like object so no filesystem is needed. -compression_level: int or dict, default None +compression_level : int or dict, default None Specify the compression level for a codec, either on a general basis or per-column. If None is passed, arrow selects the compression level for the compression codec in use. The compression level has a different @@ -550,7 +554,7 @@ def _sanitize_table(table, new_schema, flavor): codec you are using. An exception is thrown if the compression codec does not allow specifying a compression level. 
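# Sketch of the ParquetFile row-group and batch APIs documented above;
# "data.parquet" is a placeholder path.
import pyarrow.parquet as pq

pf = pq.ParquetFile("data.parquet")
first_group = pf.read_row_group(0, columns=["a"])
for batch in pf.iter_batches(batch_size=1024, columns=["a"]):
    pass  # each item is a pyarrow.RecordBatch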
-use_byte_stream_split: bool or list, default False +use_byte_stream_split : bool or list, default False Specify if the byte_stream_split encoding should be used in general or only for some columns. If both dictionary and byte_stream_stream are enabled, then dictionary is preferred. @@ -560,7 +564,7 @@ def _sanitize_table(table, new_schema, flavor): The serialized Parquet data page format version to write, defaults to 1.0. This does not impact the file schema logical types and Arrow to Parquet type casting behavior; for that use the "version" option. -use_compliant_nested_type: bool, default False +use_compliant_nested_type : bool, default False Whether to write compliant Parquet nested type (lists) as defined `here `_, defaults to ``False``. @@ -597,6 +601,7 @@ class ParquetWriter: where : path or file-like object schema : arrow Schema {} +writer_engine_version : unused **options : dict If options contains a key `metadata_collector` then the corresponding value is assumed to be a list (or any object with @@ -738,6 +743,8 @@ class ParquetDatasetPiece: Two-element tuples of ``(column name, ordinal index)``. row_group : int, default None Row group to load. By default, reads all row groups. + file_options : dict + Options """ def __init__(self, path, open_file_func=partial(open, mode='rb'), @@ -826,6 +833,8 @@ def read(self, columns=None, use_threads=True, partitions=None, partitions : ParquetPartitions, default None file : file-like object Passed to ParquetFile. + use_pandas_metadata : bool + If pandas metadata should be used or not. Returns ------- @@ -892,6 +901,13 @@ class PartitionSet: Then we have two partition sets, one for foo, another for bar. As we visit levels of the partition hierarchy, a PartitionSet tracks the distinct values and assigns categorical codes to use when reading the pieces + + Parameters + ---------- + name : str + Name of the partition set. Under which key to collect all values. + keys : list + All possible values that have been collected for that partition set. """ def __init__(self, name, keys=None): @@ -904,6 +920,10 @@ def get_index(self, key): """ Get the index of the partition value if it is known, otherwise assign one + + Parameters + ---------- + key : The value for which we want to known the index. """ if key in self.key_indices: return self.key_indices[key] @@ -1248,7 +1268,7 @@ class ParquetDataset: and different partitioning schemes are supported. {1} -metadata_nthreads: int, default 1 +metadata_nthreads : int, default 1 How many threads to allow the thread pool which is used to read the dataset metadata. Increasing this is helpful to read partitioned datasets. @@ -1463,6 +1483,11 @@ def read_pandas(self, **kwargs): Read dataset including pandas metadata, if any. Other arguments passed through to ParquetDataset.read, see docstring for further details. + Parameters + ---------- + **kwargs : optional + All additional options to pass to the reader. + Returns ------- pyarrow.Table @@ -1792,11 +1817,11 @@ def filesystem(self): Parameters ---------- -source: str, pyarrow.NativeFile, or file-like object +source : str, pyarrow.NativeFile, or file-like object If a string passed, can be a single file name or directory name. For file-like objects, only read a single file. Use pyarrow.BufferReader to read a file contained in a bytes or buffer-like object. -columns: list +columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. 
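# Sketch of write_table using some of the writer options documented above;
# the compression choices are illustrative, not prescriptive.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [1.0, 2.0], "y": ["a", "b"]})
pq.write_table(table, "example.parquet",
               compression="zstd", compression_level=3,
               use_dictionary=["y"])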
@@ -1837,6 +1862,11 @@ def filesystem(self): use_legacy_dataset=False. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. +coerce_int96_timestamp_unit : str, default None. + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be infered as timestamps + in nanoseconds. Returns ------- @@ -1947,7 +1977,8 @@ def read_pandas(source, columns=None, **kwargs): read_pandas.__doc__ = _read_table_docstring.format( 'Read a Table from Parquet format, also reading DataFrame\n' 'index values if known in the file metadata', - _read_docstring_common, + "\n".join((_read_docstring_common, + """**kwargs : additional options for :func:`read_table`""")), """pyarrow.Table Content of the file as a Table of Columns, including DataFrame indexes as columns""", @@ -2003,10 +2034,12 @@ def write_table(table, where, row_group_size=None, version='1.0', Parameters ---------- table : pyarrow.Table -where: string or pyarrow.NativeFile -row_group_size: int +where : string or pyarrow.NativeFile +row_group_size : int The number of rows per rowgroup {} +**kwargs : optional + Additional options for ParquetWriter """.format(_parquet_writer_arg_docs) @@ -2179,8 +2212,9 @@ def write_metadata(schema, where, metadata_collector=None, **kwargs): Parameters ---------- schema : pyarrow.Schema - where: string or pyarrow.NativeFile - metadata_collector: + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. **kwargs : dict, Additional kwargs for ParquetWriter class. See docstring for `ParquetWriter` for more information. From 62c46cd33d418d96679dd33a484781a8b9870965 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 27 Sep 2021 14:02:52 +0200 Subject: [PATCH 06/25] Address _fs docstrings --- python/pyarrow/_fs.pyx | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index bb639c09719..5e55afcd408 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -597,15 +597,15 @@ cdef class FileSystem(_Weakrefable): Parameters ---------- - source: str + source : str The source to open for reading. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary read buffer. @@ -639,16 +639,16 @@ cdef class FileSystem(_Weakrefable): ---------- path : str The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. 
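# Sketch of the FileSystem stream-opening methods documented above, using
# LocalFileSystem and placeholder paths; compression is inferred from ".gz".
from pyarrow.fs import LocalFileSystem

fs = LocalFileSystem()
with fs.open_output_stream("example.txt.gz", compression="gzip") as out:
    out.write(b"hello")
with fs.open_input_stream("example.txt.gz") as f:
    data = f.read()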
- metadata: dict optional, default None + metadata : dict optional, default None If not None, a mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). @@ -693,16 +693,16 @@ cdef class FileSystem(_Weakrefable): ---------- path : str The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int optional, default None + buffer_size : int optional, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. - metadata: dict optional, default None + metadata : dict optional, default None If not None, a mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). @@ -768,7 +768,7 @@ cdef class LocalFileSystem(FileSystem): Parameters ---------- - use_mmap: bool, default False + use_mmap : bool, default False Whether open_input_stream and open_input_file should return a mmap'ed file or a regular file. """ @@ -813,9 +813,9 @@ cdef class SubTreeFileSystem(FileSystem): Parameters ---------- - base_path: str + base_path : str The root of the subtree. - base_fs: FileSystem + base_fs : FileSystem FileSystem object the operations delegated to. """ @@ -934,36 +934,42 @@ class FileSystemHandler(ABC): """ Implement PyFileSystem.type_name. """ + get_type_name.__doc__ = FileSystem.get_type_name.__doc__ @abstractmethod def get_file_info(self, paths): """ Implement PyFileSystem.get_file_info(paths). """ + get_file_info.__doc__ = FileSystem.get_file_info.__doc__ @abstractmethod def get_file_info_selector(self, selector): """ Implement PyFileSystem.get_file_info(selector). """ + get_file_info_selector.__doc__ = FileSystem.get_file_info_selector.__doc__ @abstractmethod def create_dir(self, path, recursive): """ Implement PyFileSystem.create_dir(...). """ + create_dir.__doc__ = FileSystem.create_dir.__doc__ @abstractmethod def delete_dir(self, path): """ Implement PyFileSystem.delete_dir(...). """ + delete_dir.__doc__ = FileSystem.delete_dir.__doc__ @abstractmethod def delete_dir_contents(self, path): """ Implement PyFileSystem.delete_dir_contents(...). """ + delete_dir_contents.__doc__ = FileSystem.delete_dir_contents.__doc__ @abstractmethod def delete_root_dir_contents(self): @@ -976,48 +982,56 @@ class FileSystemHandler(ABC): """ Implement PyFileSystem.delete_file(...). """ + delete_file.__doc__ = FileSystem.delete_file.__doc__ @abstractmethod def move(self, src, dest): """ Implement PyFileSystem.move(...). """ + move.__doc__ = FileSystem.move.__doc__ @abstractmethod def copy_file(self, src, dest): """ Implement PyFileSystem.copy_file(...). """ + copy_file.__doc__ = FileSystem.copy_file.__doc__ @abstractmethod def open_input_stream(self, path): """ Implement PyFileSystem.open_input_stream(...). """ + open_input_stream.__doc__ = FileSystem.open_input_stream.__doc__ @abstractmethod def open_input_file(self, path): """ Implement PyFileSystem.open_input_file(...). """ + open_input_file.__doc__ = FileSystem.open_input_file.__doc__ @abstractmethod def open_output_stream(self, path, metadata): """ Implement PyFileSystem.open_output_stream(...). 
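# Sketch of LocalFileSystem/SubTreeFileSystem documented above; the base
# directory is a placeholder.
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem

local = LocalFileSystem(use_mmap=False)
scoped = SubTreeFileSystem("/tmp/data", local)
infos = scoped.get_file_info(["example.txt"])  # resolved under /tmp/data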
""" + open_output_stream.__doc__ = FileSystem.open_output_stream.__doc__ @abstractmethod def open_append_stream(self, path, metadata): """ Implement PyFileSystem.open_append_stream(...). """ + open_append_stream.__doc__ = FileSystem.open_append_stream.__doc__ @abstractmethod def normalize_path(self, path): """ Implement PyFileSystem.normalize_path(...). """ + normalize_path.__doc__ = FileSystem.normalize_path.__doc__ # Callback definitions for CPyFileSystemVtable From cefdcf8b1e861dd19279f97d4d80e33ceafdd3f1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 27 Sep 2021 14:47:47 +0200 Subject: [PATCH 07/25] Fix pyarrow.ipc docstrings --- python/pyarrow/ipc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 049d0c95c4b..cb28a0b5fd4 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -220,7 +220,7 @@ def deserialize_pandas(buf, *, use_threads=True): ---------- buf : buffer An object compatible with the buffer protocol. - use_threads: bool, default True + use_threads : bool, default True Whether to parallelize the conversion using multiple threads. Returns From c5b034dcf0ca90dab43ee18737b6a5826ce78d95 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 27 Sep 2021 15:01:26 +0200 Subject: [PATCH 08/25] Fix feather and IPC --- python/pyarrow/feather.py | 2 +- python/pyarrow/ipc.pxi | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 1e6875ac08e..225992dc514 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -205,7 +205,7 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True): columns : sequence, optional Only read a specific set of columns. If not provided, all columns are read. - use_threads: bool, default True + use_threads : bool, default True Whether to parallelize reading using multiple threads. memory_map : boolean, default True Use memory mapping when opening file on disk diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 4b22acc076f..61ac4250a60 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -85,26 +85,27 @@ cdef _wrap_read_stats(CIpcReadStats c): cdef class IpcWriteOptions(_Weakrefable): - """Serialization options for the IPC format. + """ + Serialization options for the IPC format. Parameters ---------- metadata_version : MetadataVersion, default MetadataVersion.V5 The metadata version to write. V5 is the current and latest, V4 is the pre-1.0 metadata version (with incompatible Union layout). - allow_64bit: bool, default False + allow_64bit : bool, default False If true, allow field lengths that don't fit in a signed 32-bit int. use_legacy_format : bool, default False Whether to use the pre-Arrow 0.15 IPC format. - compression: str, Codec, or None + compression : str, Codec, or None compression codec to use for record batch buffers. If None then batch buffers will be uncompressed. Must be "lz4", "zstd" or None. To specify a compression_level use `pyarrow.Codec` - use_threads: bool + use_threads : bool Whether to use the global CPU thread pool to parallelize any computational tasks like compression. - emit_dictionary_deltas: bool + emit_dictionary_deltas : bool Whether to emit dictionary deltas. Default is false for maximum stream compatibility. 
""" From d0d90ac8c6285d816239b073408304efb03534d8 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Tue, 28 Sep 2021 10:33:44 +0200 Subject: [PATCH 09/25] Document tensor and matrixes --- python/pyarrow/io.pxi | 48 +++++++++++++++ python/pyarrow/ipc.pxi | 10 ++++ python/pyarrow/tensor.pxi | 122 +++++++++++++++++++++++++++++++++++++- python/pyarrow/types.pxi | 24 ++++++++ 4 files changed, 203 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7d7cb1afb00..29416190d31 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1307,6 +1307,18 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): + """ + Wraps an input stream making it buffered. + + Parameters + ---------- + stream : NativeFile + The stream to wrap with the buffer + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__(self, NativeFile stream, int buffer_size, MemoryPool memory_pool=None): @@ -1357,6 +1369,18 @@ cdef class BufferedInputStream(NativeFile): cdef class BufferedOutputStream(NativeFile): + """ + Wraps an output stream making it buffered. + + Parameters + ---------- + stream : NativeFile + The stream to wrap with the buffer + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ def __init__(self, NativeFile stream, int buffer_size, MemoryPool memory_pool=None): @@ -1724,6 +1748,12 @@ cdef class Codec(_Weakrefable): """ Returns true if the compression level parameter is supported for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, valid values are: gzip, bz2, brotli, + lz4, zstd and snappy. """ cdef CCompressionType typ = _ensure_compression(compression) return CCodec.SupportsCompressionLevel(typ) @@ -1733,6 +1763,12 @@ cdef class Codec(_Weakrefable): """ Returns the compression level that Arrow will use for the codec if None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, valid values are: gzip, bz2, brotli, + lz4, zstd and snappy. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.DefaultCompressionLevel(typ)) @@ -1741,6 +1777,12 @@ cdef class Codec(_Weakrefable): def minimum_compression_level(str compression not None): """ Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, valid values are: gzip, bz2, brotli, + lz4, zstd and snappy. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MinimumCompressionLevel(typ)) @@ -1749,6 +1791,12 @@ cdef class Codec(_Weakrefable): def maximum_compression_level(str compression not None): """ Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, valid values are: gzip, bz2, brotli, + lz4, zstd and snappy. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MaximumCompressionLevel(typ)) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 61ac4250a60..5a29a8c218b 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -782,6 +782,11 @@ cdef class _RecordBatchFileReader(_Weakrefable): def get_tensor_size(Tensor tensor): """ Return total size of serialized Tensor including metadata and padding. 
+ + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. """ cdef int64_t size with nogil: @@ -792,6 +797,11 @@ def get_tensor_size(Tensor tensor): def get_record_batch_size(RecordBatch batch): """ Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. """ cdef int64_t size with nogil: diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 9bc24ceb473..6b62829ba7e 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -38,6 +38,16 @@ strides: {0.strides}""".format(self) @staticmethod def from_numpy(obj, dim_names=None): + """ + Create a Tensor from a numpy array. + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list + Names of each dimension of the Tensor. + """ cdef: vector[c_string] c_dim_names shared_ptr[CTensor] ctensor @@ -160,6 +170,17 @@ shape: {0.shape}""".format(self) def from_numpy(data, coords, shape, dim_names=None): """ Create arrow::SparseCOOTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + coords : numpy.ndarray + Coordinates of the data. + shape : tuple + Shape of the tensor. + dim_names : list + Names of the dimensions. """ cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef vector[int64_t] c_shape @@ -186,6 +207,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.coo_matrix): @@ -225,6 +253,13 @@ shape: {0.shape}""".format(self) def from_pydata_sparse(obj, dim_names=None): """ Convert pydata/sparse.COO to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : pydata.sparse.COO + The object that should be converted. + dim_names : list, optional + Names of the dimensions. """ import sparse if not isinstance(obj, sparse.COO): @@ -252,6 +287,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. """ cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -395,13 +435,34 @@ shape: {0.shape}""".format(self) def from_dense_numpy(cls, obj, dim_names=None): """ Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list, optional + The names of the dimensions. """ return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names)) @staticmethod def from_numpy(data, indptr, indices, shape, dim_names=None): """ - Create arrow::SparseCSRMatrix from numpy.ndarrays + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list + Names of the dimensions. 
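# Sketch of the tensor constructors documented above; SciPy is only needed
# for the sparse half of the example.
import numpy as np
import pyarrow as pa
from scipy.sparse import coo_matrix

dense = pa.Tensor.from_numpy(np.arange(6).reshape(2, 3),
                             dim_names=["row", "col"])
coo = coo_matrix((np.array([1.0, 2.0]),
                  (np.array([0, 1]), np.array([2, 0]))), shape=(2, 3))
sparse = pa.SparseCOOTensor.from_scipy(coo, dim_names=["row", "col"])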
""" cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef vector[int64_t] c_shape @@ -432,6 +493,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.csr_matrix): @@ -462,6 +530,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. """ cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -585,6 +658,20 @@ shape: {0.shape}""".format(self) def from_numpy(data, indptr, indices, shape, dim_names=None): """ Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list + Names of the dimensions. """ cdef shared_ptr[CSparseCSCMatrix] csparse_tensor cdef vector[int64_t] c_shape @@ -615,6 +702,13 @@ shape: {0.shape}""".format(self) def from_scipy(obj, dim_names=None): """ Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The SciPy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.csc_matrix): @@ -645,6 +739,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. """ cdef shared_ptr[CSparseCSCMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -771,6 +870,22 @@ shape: {0.shape}""".format(self) dim_names=None): """ Create arrow::SparseCSFTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the rows. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + axis_order : list, optional + The order of the axis. + dim_names : list, optional + Names of the dimensions. """ cdef shared_ptr[CSparseCSFTensor] csparse_tensor cdef vector[int64_t] c_axis_order @@ -817,6 +932,11 @@ shape: {0.shape}""".format(self) def from_tensor(obj): """ Convert arrow::Tensor to arrow::SparseCSFTensor + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. """ cdef shared_ptr[CSparseCSFTensor] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index cf98486c70c..fb8a9405dd9 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2807,14 +2807,38 @@ def from_numpy_dtype(object dtype): def is_boolean_value(object obj): + """ + Check if the object is a boolean. 
+ + Parameters + ---------- + obj : object + The object to check + """ return IsPyBool(obj) def is_integer_value(object obj): + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ return IsPyInt(obj) def is_float_value(object obj): + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ return IsPyFloat(obj) From 95bf51917f22bb2857d62b1361961b5a92256db0 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Tue, 28 Sep 2021 16:51:53 +0200 Subject: [PATCH 10/25] More docstrings --- python/pyarrow/_compute.pyx | 16 +++++++++++ python/pyarrow/_cuda.pyx | 3 ++- python/pyarrow/array.pxi | 9 ++++--- python/pyarrow/io.pxi | 46 ++++++++++++++++++++++++++------ python/pyarrow/ipc.pxi | 23 ++++++++++++++++ python/pyarrow/scalar.pxi | 4 +-- python/pyarrow/serialization.pxi | 4 +-- python/pyarrow/table.pxi | 8 +++--- python/pyarrow/types.pxi | 25 ++++++++++++----- 9 files changed, 112 insertions(+), 26 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 44afd4ba0bf..a857e40d254 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -670,6 +670,14 @@ class CastOptions(_CastOptions): @staticmethod def safe(target_type=None): + """" + Cast operation options. + + Parameters + ---------- + target_type : optional + Target type for the safe cast. + """ self = CastOptions() self._set_safe() self._set_type(target_type) @@ -677,6 +685,14 @@ class CastOptions(_CastOptions): @staticmethod def unsafe(target_type=None): + """" + Cast operation options. + + Parameters + ---------- + target_type : optional + Target type for the unsafe cast. + """ self = CastOptions() self._set_unsafe() self._set_type(target_type) diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index f4ca7639885..1b66b95089a 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -187,7 +187,8 @@ cdef class Context(_Weakrefable): return pyarrow_wrap_cudabuffer(cudabuf) def foreign_buffer(self, address, size, base=None): - """Create device buffer from address and size as a view. + """ + Create device buffer from address and size as a view. The caller is responsible for allocating and freeing the memory. When `address==size==0` then a new zero-sized buffer diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6d5bee6584c..8dba99b6508 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -878,7 +878,7 @@ cdef class Array(_PandasConvertible): Parameters ---------- - sequence : ndarray, pandas.Series, array-like + obj : ndarray, pandas.Series, array-like mask : array (boolean), optional Indicate which values are null (True) or not null (False). type : pyarrow.DataType @@ -1618,6 +1618,7 @@ cdef class ListArray(BaseListArray): ---------- offsets : Array (int32 type) values : Array (any type) + pool : MemoryPool Returns ------- @@ -1699,6 +1700,7 @@ cdef class LargeListArray(BaseListArray): ---------- offsets : Array (int64 type) values : Array (any type) + pool : MemoryPool Returns ------- @@ -1748,6 +1750,7 @@ cdef class MapArray(Array): offsets : array-like or sequence (int32 type) keys : array-like or sequence (any type) items : array-like or sequence (any type) + pool : MemoryPool Returns ------- @@ -2322,9 +2325,9 @@ cdef class ExtensionArray(Array): Parameters ---------- - typ: DataType + typ : DataType The extension type for the result array. 
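# Sketch of Array casting and ListArray.from_arrays touched by the patch above.
import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int64())
as_int32 = arr.cast(pa.int32(), safe=True)

offsets = pa.array([0, 2, 3], type=pa.int32())
values = pa.array(["a", "b", "c"])
lists = pa.ListArray.from_arrays(offsets, values)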
- storage: Array + storage : Array The underlying storage for the result array. Returns diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 29416190d31..68574783a2f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -759,6 +759,16 @@ cdef class MemoryMappedFile(NativeFile): @staticmethod def create(path, size): + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ cdef: shared_ptr[CMemoryMappedFile] handle c_string c_path = encode_file_path(path) @@ -1430,6 +1440,14 @@ cdef void _cb_transform(transform_func, const shared_ptr[CBuffer]& src, cdef class TransformInputStream(NativeFile): + """ + Transform and input stream. + + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ def __init__(self, NativeFile stream, transform_func): self.set_input_stream(TransformInputStream.make_native( @@ -1502,6 +1520,18 @@ def foreign_buffer(address, size, base=None): The *base* object will be kept alive as long as this buffer is alive, including across language boundaries (for example if the buffer is referenced by C++ code). + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. """ cdef: intptr_t c_addr = address @@ -1649,7 +1679,7 @@ cdef class Codec(_Weakrefable): Type of compression codec to initialize, valid values are: 'gzip', 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and 'snappy'. - compression_level: int, None + compression_level : int, None Optional parameter specifying how aggressively to compress. The possible ranges and effect of this parameter depend on the specific codec chosen. Higher values compress more but typically use more @@ -1732,7 +1762,7 @@ cdef class Codec(_Weakrefable): Parameters ---------- - compression: str + compression : str Type of compression codec, valid values are: gzip, bz2, brotli, lz4, zstd and snappy. @@ -1984,15 +2014,15 @@ def input_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source: str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object, ... The source to open for reading. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. If "detect" and source is a file path, then compression will be chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int, default None + buffer_size : int, default None If None or 0, no buffering will happen. Otherwise the size of the temporary read buffer. """ @@ -2036,15 +2066,15 @@ def output_stream(source, compression='detect', buffer_size=None): Parameters ---------- - source: str, Path, buffer, file-like object, ... + source : str, Path, buffer, file-like object, ... The source to open for writing. - compression: str optional, default 'detect' + compression : str optional, default 'detect' The compression algorithm to use for on-the-fly compression. If "detect" and source is a file path, then compression will be chosen based on the file extension. 
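# Sketch of the Codec compression-level helpers documented above; "zstd" is
# just one codec that supports levels.
import pyarrow as pa

if pa.Codec.is_available("zstd") and pa.Codec.supports_compression_level("zstd"):
    level = pa.Codec.default_compression_level("zstd")
    codec = pa.Codec("zstd", compression_level=level)
    compressed = codec.compress(b"some bytes to compress")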
If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). - buffer_size: int, default None + buffer_size : int, default None If None or 0, no buffering will happen. Otherwise the size of the temporary write buffer. """ diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 5a29a8c218b..9304bbb9781 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -54,6 +54,14 @@ _WriteStats = namedtuple( class WriteStats(_WriteStats): """IPC write statistics + + Parameters + ---------- + num_messages : number of messages. + num_record_batches : number of record batches. + num_dictionary_batches : number of dictionary batches. + num_dictionary_deltas : delta of dictionaries. + num_replaced_dictionaries : number of replaced dictionaries. """ __slots__ = () @@ -73,6 +81,14 @@ _ReadStats = namedtuple( class ReadStats(_ReadStats): """IPC read statistics + + Parameters + ---------- + num_messages : number of messages. + num_record_batches : number of record batches. + num_dictionary_batches : number of dictionary batches. + num_dictionary_deltas : delta of dictionaries. + num_replaced_dictionaries : number of replaced dictionaries. """ __slots__ = () @@ -311,6 +327,13 @@ cdef class MessageReader(_Weakrefable): @staticmethod def open_stream(source): + """ + Open stream from source. + + Parameters + ---------- + source : a readable source, like an InputStream + """ cdef: MessageReader result = MessageReader.__new__(MessageReader) shared_ptr[CInputStream] in_stream diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 4a345878699..a3061655851 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -873,9 +873,9 @@ cdef class ExtensionScalar(Scalar): Parameters ---------- - typ: DataType + typ : DataType The extension type for the result scalar. - value: object + value : object The storage value for the result scalar. Returns diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index 9177b2aa27b..248c5d6b614 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -478,9 +478,9 @@ def deserialize_from(source, object base, SerializationContext context=None): Parameters ---------- - source: NativeFile + source : NativeFile File to read the sequence from. - base: object + base : object This object will be the base object of all the numpy arrays contained in the sequence. context : SerializationContext diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f22e0e4f586..9ef57439500 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1006,8 +1006,8 @@ cdef class RecordBatch(_PandasConvertible): Parameters ---------- - df: pandas.DataFrame - schema: pyarrow.Schema, optional + df : pandas.DataFrame + schema : pyarrow.Schema, optional The expected schema of the RecordBatch. This can be used to indicate the type of columns if we cannot infer it automatically. If passed, the output will have exactly this schema. Columns @@ -1043,7 +1043,7 @@ cdef class RecordBatch(_PandasConvertible): Parameters ---------- - arrays: list of pyarrow.Array + arrays : list of pyarrow.Array One for each field in RecordBatch names : list of str, optional Names for the batch fields. If not passed, schema must be passed @@ -2283,7 +2283,7 @@ def concat_tables(tables, c_bool promote=False, MemoryPool memory_pool=None): ---------- tables : iterable of pyarrow.Table objects Pyarrow tables to concatenate into a single Table. 
- promote: bool, default False + promote : bool, default False If True, concatenate tables with null-filling and null type promotion. memory_pool : MemoryPool, default None For memory allocations, if required, otherwise use default pool. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fb8a9405dd9..f9960b568ea 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -751,6 +751,11 @@ cdef class BaseExtensionType(DataType): cdef class ExtensionType(BaseExtensionType): """ Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + extension_name : str """ def __cinit__(self): @@ -764,11 +769,6 @@ cdef class ExtensionType(BaseExtensionType): This should be called at the end of the subclass' ``__init__`` method. - - Parameters - ---------- - storage_type : DataType - extension_name : str """ cdef: shared_ptr[CExtensionType] cpy_ext_type @@ -941,7 +941,16 @@ def unregister_extension_type(type_name): cdef class KeyValueMetadata(_Metadata, Mapping): + """ + KeyValueMetadata + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata + **kwargs : optional + additional key-value metadata + """ def __init__(self, __arg0__=None, **kwargs): cdef: vector[c_string] keys, values @@ -2741,7 +2750,7 @@ def schema(fields, metadata=None): Parameters ---------- - field : iterable of Fields or tuples, or mapping of strings to DataTypes + fields : iterable of Fields or tuples, or mapping of strings to DataTypes metadata : dict, default None Keys and values must be coercible to bytes. @@ -2797,6 +2806,10 @@ def schema(fields, metadata=None): def from_numpy_dtype(object dtype): """ Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : the numpy dtype to convert """ cdef shared_ptr[CDataType] c_type dtype = np.dtype(dtype) From ffb873681ccfda009696d9379064758000d13aeb Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 12:19:58 +0200 Subject: [PATCH 11/25] Only compute functions pending --- python/pyarrow/array.pxi | 2 +- python/pyarrow/error.pxi | 2 +- python/pyarrow/io.pxi | 21 +++++++++++++++++++++ python/pyarrow/lib.pyx | 5 +++++ python/pyarrow/memory.pxi | 18 ++++++++++++++++++ python/pyarrow/serialization.pxi | 10 +++++----- python/pyarrow/types.pxi | 17 +++++++++++++++++ 7 files changed, 68 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8dba99b6508..c9a4f3efb5e 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -388,7 +388,7 @@ def repeat(value, size, MemoryPool memory_pool=None): Parameters ---------- - value: Scalar-like object + value : Scalar-like object Either a pyarrow.Scalar or any python object coercible to a Scalar. size : int Number of times to repeat the scalar in the output Array. diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi index 882427f32ea..0280016288a 100644 --- a/python/pyarrow/error.pxi +++ b/python/pyarrow/error.pxi @@ -163,7 +163,7 @@ def enable_signal_handlers(c_bool enable): Parameters ---------- - enable: bool + enable : bool Whether to enable user interruption by setting a temporary signal handler. """ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 68574783a2f..0a114607c03 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1443,6 +1443,8 @@ cdef class TransformInputStream(NativeFile): """ Transform and input stream. + Parameters + ---------- stream : NativeFile The stream to transform. 
transform_func : callable @@ -1478,6 +1480,20 @@ class Transcoder: def transcoding_input_stream(stream, src_encoding, dest_encoding): + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + emitted data will be encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data data. + dest_encoding : str + The codec to use for emitted data. + """ src_codec = codecs.lookup(src_encoding) dest_codec = codecs.lookup(dest_encoding) if src_codec.name == dest_codec.name: @@ -1506,6 +1522,11 @@ cdef shared_ptr[CInputStream] native_transcoding_input_stream( def py_buffer(object obj): """ Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + the object from which the buffer should be constructed. """ cdef shared_ptr[CBuffer] buf buf = GetResultValue(PyBuffer.FromPyObject(obj)) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 60e1f8c53bb..7464d6c95bd 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -60,6 +60,11 @@ def set_cpu_count(int count): """ Set the number of threads to use in parallel operations. + Parameters + ---------- + count : int + The number of concurrent cpu that should be set. + See Also -------- cpu_count : Get the size of this pool. diff --git a/python/pyarrow/memory.pxi b/python/pyarrow/memory.pxi index fc0d32aad56..8ccb3505842 100644 --- a/python/pyarrow/memory.pxi +++ b/python/pyarrow/memory.pxi @@ -123,6 +123,11 @@ def proxy_memory_pool(MemoryPool parent): """ Create and return a MemoryPool instance that redirects to the *parent*, but with separate allocation statistics. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. """ cdef ProxyMemoryPool out = ProxyMemoryPool.__new__(ProxyMemoryPool) out.proxy_pool.reset(new CProxyMemoryPool(parent.pool)) @@ -134,6 +139,11 @@ def logging_memory_pool(MemoryPool parent): """ Create and return a MemoryPool instance that redirects to the *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. """ cdef LoggingMemoryPool out = LoggingMemoryPool.__new__( LoggingMemoryPool, parent) @@ -181,6 +191,14 @@ def mimalloc_memory_pool(): def set_memory_pool(MemoryPool pool): + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ c_set_default_memory_pool(pool.pool) diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index 248c5d6b614..c03721578a9 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -373,7 +373,7 @@ def serialize(object value, SerializationContext context=None): Parameters ---------- - value: object + value : object Python object for the sequence that is to be serialized. context : SerializationContext Custom serialization and deserialization context, uses a default @@ -412,9 +412,9 @@ def serialize_to(object value, sink, SerializationContext context=None): Parameters ---------- - value: object + value : object Python object for the sequence that is to be serialized. - sink: NativeFile or file-like + sink : NativeFile or file-like File the sequence will be written to. 
context : SerializationContext Custom serialization and deserialization context, uses a default @@ -437,9 +437,9 @@ def read_serialized(source, base=None): Parameters ---------- - source: NativeFile + source : NativeFile File to read the sequence from. - base: object + base : object This object will be the base object of all the numpy arrays contained in the sequence. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index f9960b568ea..26e9752f457 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -841,6 +841,11 @@ cdef class PyExtensionType(ExtensionType): """ Concrete base class for Python-defined extension types based on pickle for (de)serialization. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. """ def __cinit__(self): @@ -880,6 +885,13 @@ cdef class UnknownExtensionType(PyExtensionType): """ A concrete class for Python-defined extension types that refer to an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialised output. """ cdef: @@ -2718,6 +2730,11 @@ def type_for_alias(name): """ Return DataType given a string alias if one exists. + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. + Returns ------- type : DataType From f5ed74b694bf542621b8d1f2a94e0bb7fabe0979 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 14:03:53 +0200 Subject: [PATCH 12/25] Remaining modules --- python/pyarrow/_compute.pyx | 29 +++++++++++++++++++++++++++++ python/pyarrow/compute.py | 4 ++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a857e40d254..54dbea43354 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -463,6 +463,11 @@ cdef class FunctionRegistry(_Weakrefable): def get_function(self, name): """ Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup """ cdef: c_string c_name = tobytes(name) @@ -485,6 +490,11 @@ def get_function(name): The function is looked up in the global registry (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup """ return _global_func_registry.get_function(name) @@ -502,6 +512,17 @@ def call_function(name, args, options=None, memory_pool=None): The function is looked up in the global registry (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function. """ func = _global_func_registry.get_function(name) return func.call(args, options=options, memory_pool=memory_pool) @@ -524,6 +545,14 @@ cdef class FunctionOptions(_Weakrefable): @staticmethod def deserialize(buf): + """ + Deserialize options for a function. + + Parameters + ---------- + buf : Buffer + The buffer containing the data to deserialize. 
+ """ cdef: shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf) CResult[unique_ptr[CFunctionOptions]] maybe_options = \ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 53c49fb5cda..80c6614fc9c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -634,10 +634,10 @@ def fill_null(values, fill_value): Parameters ---------- - data : Array, ChunkedArray, or Scalar-like object + values : Array, ChunkedArray, or Scalar-like object Each null element is replaced with the corresponding value from fill_value. - fill_value: Array, ChunkedArray, or Scalar-like object + fill_value : Array, ChunkedArray, or Scalar-like object If not same type as data will attempt to cast. Returns From b6f36aebf8a78108defa640f0c44c84033be108e Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 14:05:25 +0200 Subject: [PATCH 13/25] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- python/pyarrow/parquet.py | 2 +- python/pyarrow/tensor.pxi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 682c44e2e25..680d20093de 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -215,7 +215,7 @@ class ParquetFile: high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. read_dictionary : list - List of names to read directly as DictionaryArray + List of names to read directly as DictionaryArray. coerce_int96_timestamp_unit : str, default None. Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 6b62829ba7e..18cd797c540 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -45,7 +45,7 @@ strides: {0.strides}""".format(self) ---------- obj : numpy.ndarray The source numpy array - dim_names : list + dim_names : list, optional Names of each dimension of the Tensor. """ cdef: From 472339a54d56aeab6f3ff1b43c6b9b05cade58fb Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 14:39:40 +0200 Subject: [PATCH 14/25] More docstrings for dataset and fs --- python/pyarrow/_dataset.pyx | 94 +++++++++++++++++++++++++++++++++++-- python/pyarrow/_fs.pyx | 76 ++++++++++++++++++++++++------ python/pyarrow/_s3fs.pyx | 39 +++++++++------ python/pyarrow/fs.py | 6 +++ 4 files changed, 182 insertions(+), 33 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 5a906a32ade..c1b240b8a82 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -569,7 +569,8 @@ cdef class InMemoryDataset(Dataset): cdef class UnionDataset(Dataset): - """A Dataset wrapping child datasets. + """ + A Dataset wrapping child datasets. Children's schemas must agree with the provided schema. @@ -2791,7 +2792,14 @@ cdef class RecordBatchIterator(_Weakrefable): class TaggedRecordBatch(collections.namedtuple( "TaggedRecordBatch", ["record_batch", "fragment"])): - """A combination of a record batch and the fragment it came from.""" + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : The record batch. + fragment : fragment of the record batch. 
+ """ cdef class TaggedRecordBatchIterator(_Weakrefable): @@ -2945,6 +2953,32 @@ cdef class Scanner(_Weakrefable): object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): + """ + Create Scanner from Dataset + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. + """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() shared_ptr[CScannerBuilder] builder @@ -2966,6 +3000,34 @@ cdef class Scanner(_Weakrefable): object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): + """ + Create Scanner from Fragment + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema + The schema of the fragment. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. + """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() shared_ptr[CScannerBuilder] builder @@ -2990,12 +3052,38 @@ cdef class Scanner(_Weakrefable): Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): - """Create a Scanner from an iterator of batches. + """ + Create a Scanner from an iterator of batches. This creates a scanner which can be used only once. It is intended to support writing a dataset (which takes a scanner) from a source which can be read only once (e.g. a RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator + The iterator of Batches. + schema : Schema + The schema of the batches. + columns : list of str or dict, default None + The columns to project. + filter : Expression, default None + Scan will return only the rows matching the filter. + batch_size : int, default 1M + The maximum row count for scanned record batches. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. 
+ use_async : bool, default False + If enabled, an async scanner will be used that should offer + better performance with high-latency/highly-parallel filesystems + (e.g. S3) + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + fragment_scan_options : FragmentScanOptions + The fragment scan options. """ cdef: shared_ptr[CScanOptions] options = make_shared[CScanOptions]() diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 5e55afcd408..945ffdf85af 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -934,42 +934,57 @@ class FileSystemHandler(ABC): """ Implement PyFileSystem.type_name. """ - get_type_name.__doc__ = FileSystem.get_type_name.__doc__ @abstractmethod def get_file_info(self, paths): """ Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : paths for which we want to retrieve the info. """ - get_file_info.__doc__ = FileSystem.get_file_info.__doc__ @abstractmethod def get_file_info_selector(self, selector): """ Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : selector for which we want to retrieve the info. """ - get_file_info_selector.__doc__ = FileSystem.get_file_info_selector.__doc__ @abstractmethod def create_dir(self, path, recursive): """ Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : path of the directory. + recursive : if the parent directories should be created too. """ - create_dir.__doc__ = FileSystem.create_dir.__doc__ @abstractmethod def delete_dir(self, path): """ Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : path of the directory. """ - delete_dir.__doc__ = FileSystem.delete_dir.__doc__ @abstractmethod def delete_dir_contents(self, path): """ Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : path of the directory. """ - delete_dir_contents.__doc__ = FileSystem.delete_dir_contents.__doc__ @abstractmethod def delete_root_dir_contents(self): @@ -981,58 +996,89 @@ class FileSystemHandler(ABC): def delete_file(self, path): """ Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : path of the file. """ - delete_file.__doc__ = FileSystem.delete_file.__doc__ @abstractmethod def move(self, src, dest): """ Implement PyFileSystem.move(...). + + Parameters + ---------- + src : path of what should be moved. + dest : path of where it should be moved to. """ - move.__doc__ = FileSystem.move.__doc__ @abstractmethod def copy_file(self, src, dest): """ Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : path of what should be copied. + dest : path of where it should be copied to. """ - copy_file.__doc__ = FileSystem.copy_file.__doc__ @abstractmethod def open_input_stream(self, path): """ Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : path of what should be opened. """ - open_input_stream.__doc__ = FileSystem.open_input_stream.__doc__ @abstractmethod def open_input_file(self, path): """ Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : path of what should be opened. """ - open_input_file.__doc__ = FileSystem.open_input_file.__doc__ @abstractmethod def open_output_stream(self, path, metadata): """ Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : path of what should be opened. + metadata : mapping of string keys to string values. 
+ Some filesystems support storing metadata along the file + (such as "Content-Type"). """ - open_output_stream.__doc__ = FileSystem.open_output_stream.__doc__ @abstractmethod def open_append_stream(self, path, metadata): """ Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : path of what should be opened. + metadata : mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). """ - open_append_stream.__doc__ = FileSystem.open_append_stream.__doc__ @abstractmethod def normalize_path(self, path): """ Implement PyFileSystem.normalize_path(...). - """ - normalize_path.__doc__ = FileSystem.normalize_path.__doc__ + Parameters + ---------- + path : path of what should be normalized. + """ # Callback definitions for CPyFileSystemVtable diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index a45be28d726..5829d74d31f 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -37,6 +37,14 @@ cpdef enum S3LogLevel: def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal): + """ + Initialize S3 support + + Parameters + ---------- + log_level : S3LogLevel + level of logging + """ cdef CS3GlobalOptions options options.log_level = log_level check_status(CInitializeS3(options)) @@ -47,7 +55,8 @@ def finalize_s3(): cdef class S3FileSystem(FileSystem): - """S3-backed FileSystem implementation + """ + S3-backed FileSystem implementation If neither access_key nor secret_key are provided, and role_arn is also not provided, then attempts to initialize from AWS environment variables, @@ -62,43 +71,43 @@ cdef class S3FileSystem(FileSystem): Parameters ---------- - access_key: str, default None + access_key : str, default None AWS Access Key ID. Pass None to use the standard AWS environment variables and/or configuration file. - secret_key: str, default None + secret_key : str, default None AWS Secret Access key. Pass None to use the standard AWS environment variables and/or configuration file. - session_token: str, default None + session_token : str, default None AWS Session Token. An optional session token, required if access_key and secret_key are temporary credentials from STS. - anonymous: boolean, default False + anonymous : boolean, default False Whether to connect anonymously if access_key and secret_key are None. If true, will not attempt to look up credentials using standard AWS configuration methods. - role_arn: str, default None + role_arn : str, default None AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. - session_name: str, default None + session_name : str, default None An optional identifier for the assumed role session. - external_id: str, default None + external_id : str, default None An optional unique identifier that might be required when you assume a role in another account. - load_frequency: int, default 900 + load_frequency : int, default 900 The frequency (in seconds) with which temporary credentials from an assumed role session will be refreshed. - region: str, default 'us-east-1' + region : str, default 'us-east-1' AWS region to connect to. - scheme: str, default 'https' + scheme : str, default 'https' S3 connection transport scheme. 
- endpoint_override: str, default None + endpoint_override : str, default None Override region with a connect string such as "localhost:9000" - background_writes: boolean, default True + background_writes : boolean, default True Whether file writes will be issued in the background, without blocking. - default_metadata: mapping or KeyValueMetadata, default None + default_metadata : mapping or KeyValueMetadata, default None Default metadata for open_output_stream. This will be ignored if non-empty metadata is passed to open_output_stream. - proxy_options: dict or str, default None + proxy_options : dict or str, default None If a proxy is used, provide the options here. Supported options are: 'scheme' (str: 'http' or 'https'; required), 'host' (str; required), 'port' (int; required), 'username' (str; optional), diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 80ee536d44c..778c37436de 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -256,6 +256,12 @@ class FSSpecHandler(FileSystemHandler): https://filesystem-spec.readthedocs.io/en/latest/index.html + Parameters + ---------- + fs : The file system implementation according to FSSpec. + + Example + ------- >>> PyFileSystem(FSSpecHandler(fsspec_fs)) """ From 0eea3599d975495e7d7aa14d1360a839d3a4b892 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 16:05:26 +0200 Subject: [PATCH 15/25] More work on docstrings --- python/pyarrow/_dataset.pyx | 43 +++++++++++++++++++++------- python/pyarrow/tests/test_compute.py | 11 ++++--- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index c1b240b8a82..9a2ce8b32d1 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -508,12 +508,13 @@ cdef class Dataset(_Weakrefable): cdef class InMemoryDataset(Dataset): - """A Dataset wrapping in-memory data. + """ + A Dataset wrapping in-memory data. Parameters ---------- - source - The data for this dataset. Can be a RecordBatch, Table, list of + source : The data for this dataset. + Can be a RecordBatch, Table, list of RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. If an iterable is provided, the schema must also be provided. schema : Schema, optional @@ -612,7 +613,8 @@ cdef class UnionDataset(Dataset): cdef class FileSystemDataset(Dataset): - """A Dataset of file fragments. + """ + A Dataset of file fragments. A FileSystemDataset is composed of one or more FileFragment. @@ -1155,7 +1157,15 @@ cdef class FileFragment(Fragment): class RowGroupInfo: - """A wrapper class for RowGroup information""" + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : the group id. + metadata : the rowgroup metadata. + schema : schema of the rows. + """ def __init__(self, id, metadata, schema): self.id = id @@ -1535,6 +1545,18 @@ cdef set _PARQUET_READ_OPTIONS = { cdef class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option. + """ cdef: CParquetFileFormat* parquet_format @@ -1669,7 +1691,8 @@ cdef class ParquetFileFormat(FileFormat): cdef class ParquetFragmentScanOptions(FragmentScanOptions): - """Scan-specific options for Parquet fragments. + """ + Scan-specific options for Parquet fragments. 
Parameters ---------- @@ -2330,7 +2353,7 @@ cdef class DatasetFactory(_Weakrefable): shared_ptr[CDatasetFactory] wrapped CDatasetFactory* factory - def __init__(self, list children): + def __init__(self): _forbid_instantiation(self.__class__) cdef init(self, const shared_ptr[CDatasetFactory]& sp): @@ -2388,7 +2411,7 @@ cdef class DatasetFactory(_Weakrefable): Parameters ---------- - schema: Schema, default None + schema : Schema, default None The schema to conform the source to. If None, the inspected schema is used. @@ -2423,7 +2446,7 @@ cdef class FileSystemFactoryOptions(_Weakrefable): partition_base_dir prefix will be skipped for partitioning discovery. The ignored files will still be part of the Dataset, but will not have partition information. - partitioning: Partitioning/PartitioningFactory, optional + partitioning : Partitioning/PartitioningFactory, optional Apply the Partitioning to every discovered Fragment. See Partitioning or PartitioningFactory documentation. exclude_invalid_files : bool, optional (default True) @@ -2533,7 +2556,7 @@ cdef class FileSystemDatasetFactory(DatasetFactory): ---------- filesystem : pyarrow.fs.FileSystem Filesystem to discover. - paths_or_selector: pyarrow.fs.Selector or list of path-likes + paths_or_selector : pyarrow.fs.Selector or list of path-likes Either a Selector object or a list of path-like objects. format : FileFormat Currently only ParquetFileFormat and IpcFileFormat are supported. diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 579b56b64e7..1ac3c30a381 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -650,10 +650,13 @@ def test_generated_docstrings(): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. options : pyarrow.compute.ScalarAggregateOptions, optional - Parameters altering compute function semantics - **kwargs : optional - Parameters for ScalarAggregateOptions constructor. Either `options` - or `**kwargs` can be passed, but not both at the same time. + Parameters altering compute function semantics. + skip_nulls : optional + Parameter for ScalarAggregateOptions constructor. Either `options` + or `skip_nulls` can be passed, but not both at the same time. + min_count : optional + Parameter for ScalarAggregateOptions constructor. Either `options` + or `min_count` can be passed, but not both at the same time. """) assert pc.add.__doc__ == textwrap.dedent("""\ Add the arguments element-wise. From e3fa4161884ae278ddab7f117e15e6d7ec5e59d2 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 16:08:57 +0200 Subject: [PATCH 16/25] Last docstrings --- python/pyarrow/_dataset.pyx | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 9a2ce8b32d1..2275c95d0a3 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1819,6 +1819,20 @@ cdef class IpcFileFormat(FileFormat): cdef class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : ParseOptions + Options regarding parsing of CSV. + convert_options : ConvertOptions + Options regarding value conversion. + read_options : ReadOptions + Options regarding the CSV file read operation. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. 
+ """ cdef: CCsvFileFormat* csv_format @@ -1888,7 +1902,16 @@ cdef class CsvFileFormat(FileFormat): cdef class CsvFragmentScanOptions(FragmentScanOptions): - """Scan-specific options for CSV fragments.""" + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : ConvertOptions + Options regarding value conversion. + read_options : ReadOptions + Options regarding the CSV file read operation. + """ cdef: CCsvFragmentScanOptions* csv_options From 52a48d6b595e342c5fe911ff96a19a221b9c8d17 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 16:11:48 +0200 Subject: [PATCH 17/25] review comments --- python/pyarrow/io.pxi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 0a114607c03..cf88f09784d 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1318,14 +1318,15 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): """ - Wraps an input stream making it buffered. + An InputStream that performs buffered reads from an unbuffered InputStream, + which can mitigate the overhead of many small reads in some cases Parameters ---------- stream : NativeFile The stream to wrap with the buffer buffer_size : int - Size of the buffer that should be added. + Size of the temporary read buffer. memory_pool : MemoryPool The memory pool used to allocate the buffer. """ From 8461025c7093635bae642ec40ab0f5e02fa863c9 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 29 Sep 2021 16:26:33 +0200 Subject: [PATCH 18/25] Lint --- python/pyarrow/_dataset.pyx | 12 ++++++------ python/pyarrow/_fs.pyx | 1 + python/pyarrow/tensor.pxi | 8 ++++---- python/pyarrow/types.pxi | 1 + 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 2275c95d0a3..47f390e53ba 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1159,7 +1159,7 @@ cdef class FileFragment(Fragment): class RowGroupInfo: """ A wrapper class for RowGroup information - + Parameters ---------- id : the group id. @@ -1904,7 +1904,7 @@ cdef class CsvFileFormat(FileFormat): cdef class CsvFragmentScanOptions(FragmentScanOptions): """ Scan-specific options for CSV fragments. - + Parameters ---------- convert_options : ConvertOptions @@ -2840,7 +2840,7 @@ class TaggedRecordBatch(collections.namedtuple( "TaggedRecordBatch", ["record_batch", "fragment"])): """ A combination of a record batch and the fragment it came from. - + Parameters ---------- record_batch : The record batch. @@ -3011,7 +3011,7 @@ cdef class Scanner(_Weakrefable): filter : Expression, default None Scan will return only the rows matching the filter. batch_size : int, default 1M - The maximum row count for scanned record batches. + The maximum row count for scanned record batches. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -3060,7 +3060,7 @@ cdef class Scanner(_Weakrefable): filter : Expression, default None Scan will return only the rows matching the filter. batch_size : int, default 1M - The maximum row count for scanned record batches. + The maximum row count for scanned record batches. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. 
@@ -3117,7 +3117,7 @@ cdef class Scanner(_Weakrefable): filter : Expression, default None Scan will return only the rows matching the filter. batch_size : int, default 1M - The maximum row count for scanned record batches. + The maximum row count for scanned record batches. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 945ffdf85af..a309a42ec86 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -1082,6 +1082,7 @@ class FileSystemHandler(ABC): # Callback definitions for CPyFileSystemVtable + cdef void _cb_get_type_name(handler, c_string* out) except *: out[0] = tobytes("py::" + handler.get_type_name()) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 18cd797c540..7c56a87125b 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -455,7 +455,7 @@ shape: {0.shape}""".format(self) data : numpy.ndarray Data used to populate the rows. indptr : numpy.ndarray - Range of the rows, + Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. indices : numpy.ndarray Column indices of the corresponding non-zero values. @@ -664,7 +664,7 @@ shape: {0.shape}""".format(self) data : numpy.ndarray Data used to populate the rows. indptr : numpy.ndarray - Range of the rows, + Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. indices : numpy.ndarray Column indices of the corresponding non-zero values. @@ -708,7 +708,7 @@ shape: {0.shape}""".format(self) obj : scipy.sparse.csc_matrix The SciPy matrix that should be converted. dim_names : list, optional - Names of the dimensions. + Names of the dimensions. """ import scipy.sparse if not isinstance(obj, scipy.sparse.csc_matrix): @@ -876,7 +876,7 @@ shape: {0.shape}""".format(self) data : numpy.ndarray Data used to populate the rows. indptr : numpy.ndarray - Range of the rows, + Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. indices : numpy.ndarray Column indices of the corresponding non-zero values. diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 26e9752f457..b4e0b659df5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -963,6 +963,7 @@ cdef class KeyValueMetadata(_Metadata, Mapping): **kwargs : optional additional key-value metadata """ + def __init__(self, __arg0__=None, **kwargs): cdef: vector[c_string] keys, values From e93d0446dcfc67c6df79e0a44c63bada6353af38 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 30 Sep 2021 15:24:55 +0200 Subject: [PATCH 19/25] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- python/pyarrow/_dataset.pyx | 6 +++--- python/pyarrow/fs.py | 4 ++-- python/pyarrow/io.pxi | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 47f390e53ba..30f4a06458e 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -513,8 +513,8 @@ cdef class InMemoryDataset(Dataset): Parameters ---------- - source : The data for this dataset. - Can be a RecordBatch, Table, list of + source : + The data for this dataset. Can be a RecordBatch, Table, list of RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. If an iterable is provided, the schema must also be provided. 
schema : Schema, optional @@ -3007,7 +3007,7 @@ cdef class Scanner(_Weakrefable): dataset : Dataset Dataset to scan. columns : list of str or dict, default None - The columns to project. + The columns to project. filter : Expression, default None Scan will return only the rows matching the filter. batch_size : int, default 1M diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 778c37436de..13e5d215c1e 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -260,8 +260,8 @@ class FSSpecHandler(FileSystemHandler): ---------- fs : The file system implementation according to FSSpec. - Example - ------- + Examples + -------- >>> PyFileSystem(FSSpecHandler(fsspec_fs)) """ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index cf88f09784d..7e888bbb5da 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1442,7 +1442,7 @@ cdef void _cb_transform(transform_func, const shared_ptr[CBuffer]& src, cdef class TransformInputStream(NativeFile): """ - Transform and input stream. + Transform an input stream. Parameters ---------- From b282f37edba6be30464e90a870860f5d4ce24392 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 30 Sep 2021 15:30:02 +0200 Subject: [PATCH 20/25] Some review feedback --- python/pyarrow/_compute.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 54dbea43354..3f2d5506317 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -522,7 +522,7 @@ def call_function(name, args, options=None, memory_pool=None): options : optional options provided to the function. memory_pool : MemoryPool, optional - memory pool to use for allocations during function. + memory pool to use for allocations during function execution. """ func = _global_func_registry.get_function(name) return func.call(args, options=options, memory_pool=memory_pool) @@ -700,12 +700,12 @@ class CastOptions(_CastOptions): @staticmethod def safe(target_type=None): """" - Cast operation options. + Create a CastOptions for a safe cast. Parameters ---------- target_type : optional - Target type for the safe cast. + Target cast type for the safe cast. """ self = CastOptions() self._set_safe() @@ -715,12 +715,12 @@ class CastOptions(_CastOptions): @staticmethod def unsafe(target_type=None): """" - Cast operation options. + Create a CastOptions for an unsafe cast. Parameters ---------- target_type : optional - Target type for the unsafe cast. + Target cast type for the unsafe cast. """ self = CastOptions() self._set_unsafe() From de68f95a827a9539ebc6b6816230f847dd218b77 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 30 Sep 2021 15:54:25 +0200 Subject: [PATCH 21/25] More feedback --- python/pyarrow/_dataset.pyx | 16 +++++++++------- python/pyarrow/fs.py | 2 +- python/pyarrow/io.pxi | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 30f4a06458e..586b03aded8 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -513,8 +513,8 @@ cdef class InMemoryDataset(Dataset): Parameters ---------- - source : - The data for this dataset. Can be a RecordBatch, Table, list of + source : The data for this dataset. + Can be a RecordBatch, Table, list of RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader. If an iterable is provided, the schema must also be provided. 
schema : Schema, optional @@ -1825,11 +1825,11 @@ cdef class CsvFileFormat(FileFormat): Parameters ---------- parse_options : ParseOptions - Options regarding parsing of CSV. + Options regarding CSV parsing. convert_options : ConvertOptions Options regarding value conversion. read_options : ReadOptions - Options regarding the CSV file read operation. + General read options. default_fragment_scan_options : CsvFragmentScanOptions Default options for fragments scan. """ @@ -1910,7 +1910,7 @@ cdef class CsvFragmentScanOptions(FragmentScanOptions): convert_options : ConvertOptions Options regarding value conversion. read_options : ReadOptions - Options regarding the CSV file read operation. + General read options. """ cdef: @@ -3000,7 +3000,8 @@ cdef class Scanner(_Weakrefable): int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): """ - Create Scanner from Dataset + Create Scanner from Dataset, + refer to Scanner class doc for additional details on Scanner. Parameters ---------- @@ -3047,7 +3048,8 @@ cdef class Scanner(_Weakrefable): int batch_size=_DEFAULT_BATCH_SIZE, FragmentScanOptions fragment_scan_options=None): """ - Create Scanner from Fragment + Create Scanner from Fragment, + refer to Scanner class doc for additional details on Scanner. Parameters ---------- diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 13e5d215c1e..5d33268618f 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -258,7 +258,7 @@ class FSSpecHandler(FileSystemHandler): Parameters ---------- - fs : The file system implementation according to FSSpec. + fs : The FSSpec-compliant filesystem instance. Examples -------- diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7e888bbb5da..6dbfe7b8921 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1318,13 +1318,13 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): """ - An InputStream that performs buffered reads from an unbuffered InputStream, + An input stream that performs buffered reads from an unbuffered input stream, which can mitigate the overhead of many small reads in some cases Parameters ---------- stream : NativeFile - The stream to wrap with the buffer + The input stream to wrap with the buffer buffer_size : int Size of the temporary read buffer. memory_pool : MemoryPool @@ -1381,12 +1381,13 @@ cdef class BufferedInputStream(NativeFile): cdef class BufferedOutputStream(NativeFile): """ - Wraps an output stream making it buffered. + An output stream that performs buffered writes from an unbuffered output stream, + which can mitigate the overhead of many small writes in some cases Parameters ---------- stream : NativeFile - The stream to wrap with the buffer + The writable output stream to wrap with the buffer buffer_size : int Size of the buffer that should be added. memory_pool : MemoryPool From 74d5749f997e5987c5f156c93fcac68ca6efb3b6 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 30 Sep 2021 15:58:40 +0200 Subject: [PATCH 22/25] More feedback --- python/pyarrow/io.pxi | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 6dbfe7b8921..02439e454ae 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1485,7 +1485,7 @@ def transcoding_input_stream(stream, src_encoding, dest_encoding): """ Add a transcoding transformation to the stream. 
Incoming data will be decoded according to ``src_encoding`` and - emitted data will be encoded according to ``dest_encoding``. + then re-encoded according to ``dest_encoding``. Parameters ---------- @@ -1547,14 +1547,14 @@ def foreign_buffer(address, size, base=None): Parameters ---------- address : int - Specify the starting address of the buffer. The address can + The starting address of the buffer. The address can refer to both device or host memory but it must be accessible from device after mapping it with `get_device_address` method. size : int - Specify the size of device buffer in bytes. + The size of device buffer in bytes. base : {None, object} - Specify object that owns the referenced memory. + Object that owns the referenced memory. """ cdef: intptr_t c_addr = address @@ -1786,8 +1786,8 @@ cdef class Codec(_Weakrefable): Parameters ---------- compression : str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + Type of compression codec, + refer to Codec docstring for a list of supported ones. Returns ------- @@ -1805,8 +1805,8 @@ cdef class Codec(_Weakrefable): Parameters ---------- compression : str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return CCodec.SupportsCompressionLevel(typ) @@ -1820,8 +1820,8 @@ cdef class Codec(_Weakrefable): Parameters ---------- compression : str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.DefaultCompressionLevel(typ)) @@ -1834,8 +1834,8 @@ cdef class Codec(_Weakrefable): Parameters ---------- compression : str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MinimumCompressionLevel(typ)) @@ -1848,8 +1848,8 @@ cdef class Codec(_Weakrefable): Parameters ---------- compression : str - Type of compression codec, valid values are: gzip, bz2, brotli, - lz4, zstd and snappy. + Type of compression codec, + refer to Codec docstring for a list of supported ones. """ cdef CCompressionType typ = _ensure_compression(compression) return GetResultValue(CCodec.MaximumCompressionLevel(typ)) From ee5f3ff3d472146c1ddf9d0a54aa29b09b668080 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 30 Sep 2021 16:28:08 +0200 Subject: [PATCH 23/25] Further code review feedback --- python/pyarrow/lib.pyx | 2 +- python/pyarrow/parquet.py | 2 +- python/pyarrow/tensor.pxi | 24 ++++++++++----------- python/pyarrow/types.py | 45 --------------------------------------- 4 files changed, 14 insertions(+), 59 deletions(-) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 7464d6c95bd..0d86df60136 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -63,7 +63,7 @@ def set_cpu_count(int count): Parameters ---------- count : int - The number of concurrent cpu that should be set. + The number of concurrent threads that should be used. 
See Also -------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 680d20093de..ad4d876b00e 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -215,7 +215,7 @@ class ParquetFile: high-latency filesystems (e.g. S3). If True, Arrow will use a background I/O thread pool. read_dictionary : list - List of names to read directly as DictionaryArray. + List of column names to read directly as DictionaryArray. coerce_int96_timestamp_unit : str, default None. Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 7c56a87125b..e21258da36f 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -179,7 +179,7 @@ shape: {0.shape}""".format(self) Coordinates of the data. shape : tuple Shape of the tensor. - dim_names : list + dim_names : list, optional Names of the dimensions. """ cdef shared_ptr[CSparseCOOTensor] csparse_tensor @@ -257,7 +257,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : pydata.sparse.COO - The object that should be converted. + The sparse multidimensional array that should be converted. dim_names : list, optional Names of the dimensions. """ @@ -439,7 +439,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : numpy.ndarray - The source numpy array + The dense numpy array that should be converted. dim_names : list, optional The names of the dimensions. """ @@ -453,7 +453,7 @@ shape: {0.shape}""".format(self) Parameters ---------- data : numpy.ndarray - Data used to populate the rows. + Data used to populate the sparse matrix. indptr : numpy.ndarray Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. @@ -461,7 +461,7 @@ shape: {0.shape}""".format(self) Column indices of the corresponding non-zero values. shape : tuple Shape of the matrix. - dim_names : list + dim_names : list, optional Names of the dimensions. """ cdef shared_ptr[CSparseCSRMatrix] csparse_tensor @@ -534,7 +534,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : Tensor - The tensor that should be converted. + The dense tensor that should be converted. """ cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -662,7 +662,7 @@ shape: {0.shape}""".format(self) Parameters ---------- data : numpy.ndarray - Data used to populate the rows. + Data used to populate the sparse matrix. indptr : numpy.ndarray Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. @@ -670,7 +670,7 @@ shape: {0.shape}""".format(self) Column indices of the corresponding non-zero values. shape : tuple Shape of the matrix. - dim_names : list + dim_names : list, optional Names of the dimensions. """ cdef shared_ptr[CSparseCSCMatrix] csparse_tensor @@ -706,7 +706,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : scipy.sparse.csc_matrix - The SciPy matrix that should be converted. + The scipy matrix that should be converted. dim_names : list, optional Names of the dimensions. """ @@ -743,7 +743,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : Tensor - The tensor that should be converted. + The dense tensor that should be converted. 
""" cdef shared_ptr[CSparseCSCMatrix] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) @@ -874,7 +874,7 @@ shape: {0.shape}""".format(self) Parameters ---------- data : numpy.ndarray - Data used to populate the rows. + Data used to populate the sparse tensor. indptr : numpy.ndarray Range of the rows, The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. @@ -936,7 +936,7 @@ shape: {0.shape}""".format(self) Parameters ---------- obj : Tensor - The tensor that should be converted. + The dense tensor that should be converted. """ cdef shared_ptr[CSparseCSFTensor] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 5f76cbc7f88..041946d66e6 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -48,7 +48,6 @@ def is_null(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_NA @@ -60,7 +59,6 @@ def is_boolean(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_BOOL @@ -72,7 +70,6 @@ def is_integer(t): Parameters ---------- t : DataType - type to check """ return t.id in _INTEGER_TYPES @@ -84,7 +81,6 @@ def is_signed_integer(t): Parameters ---------- t : DataType - type to check """ return t.id in _SIGNED_INTEGER_TYPES @@ -96,7 +92,6 @@ def is_unsigned_integer(t): Parameters ---------- t : DataType - type to check """ return t.id in _UNSIGNED_INTEGER_TYPES @@ -108,7 +103,6 @@ def is_int8(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_INT8 @@ -120,7 +114,6 @@ def is_int16(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_INT16 @@ -132,7 +125,6 @@ def is_int32(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_INT32 @@ -144,7 +136,6 @@ def is_int64(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_INT64 @@ -156,7 +147,6 @@ def is_uint8(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_UINT8 @@ -168,7 +158,6 @@ def is_uint16(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_UINT16 @@ -180,7 +169,6 @@ def is_uint32(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_UINT32 @@ -192,7 +180,6 @@ def is_uint64(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_UINT64 @@ -204,7 +191,6 @@ def is_floating(t): Parameters ---------- t : DataType - type to check """ return t.id in _FLOATING_TYPES @@ -216,7 +202,6 @@ def is_float16(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_HALF_FLOAT @@ -228,7 +213,6 @@ def is_float32(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_FLOAT @@ -240,7 +224,6 @@ def is_float64(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DOUBLE @@ -252,7 +235,6 @@ def is_list(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_LIST @@ -264,7 +246,6 @@ def is_large_list(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_LARGE_LIST @@ -276,7 +257,6 @@ def is_fixed_size_list(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_FIXED_SIZE_LIST @@ -288,7 +268,6 @@ def is_struct(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_STRUCT @@ -300,7 +279,6 @@ def is_union(t): Parameters ---------- t : DataType - 
type to check """ return t.id in _UNION_TYPES @@ -312,7 +290,6 @@ def is_nested(t): Parameters ---------- t : DataType - type to check """ return t.id in _NESTED_TYPES @@ -324,7 +301,6 @@ def is_temporal(t): Parameters ---------- t : DataType - type to check """ return t.id in _TEMPORAL_TYPES @@ -336,7 +312,6 @@ def is_timestamp(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_TIMESTAMP @@ -348,7 +323,6 @@ def is_duration(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DURATION @@ -360,7 +334,6 @@ def is_time(t): Parameters ---------- t : DataType - type to check """ return t.id in _TIME_TYPES @@ -372,7 +345,6 @@ def is_time32(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_TIME32 @@ -384,7 +356,6 @@ def is_time64(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_TIME64 @@ -396,7 +367,6 @@ def is_binary(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_BINARY @@ -409,7 +379,6 @@ def is_large_binary(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_LARGE_BINARY @@ -421,7 +390,6 @@ def is_unicode(t): Parameters ---------- t : DataType - type to check """ return is_string(t) @@ -433,7 +401,6 @@ def is_string(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_STRING @@ -445,7 +412,6 @@ def is_large_unicode(t): Parameters ---------- t : DataType - type to check """ return is_large_string(t) @@ -457,7 +423,6 @@ def is_large_string(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_LARGE_STRING @@ -469,7 +434,6 @@ def is_fixed_size_binary(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_FIXED_SIZE_BINARY @@ -481,7 +445,6 @@ def is_date(t): Parameters ---------- t : DataType - type to check """ return t.id in _DATE_TYPES @@ -493,7 +456,6 @@ def is_date32(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DATE32 @@ -505,7 +467,6 @@ def is_date64(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DATE64 @@ -517,7 +478,6 @@ def is_map(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_MAP @@ -529,7 +489,6 @@ def is_decimal(t): Parameters ---------- t : DataType - type to check """ return t.id in _DECIMAL_TYPES @@ -541,7 +500,6 @@ def is_decimal128(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DECIMAL128 @@ -553,7 +511,6 @@ def is_decimal256(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DECIMAL256 @@ -565,7 +522,6 @@ def is_dictionary(t): Parameters ---------- t : DataType - type to check """ return t.id == lib.Type_DICTIONARY @@ -577,6 +533,5 @@ def is_primitive(t): Parameters ---------- t : DataType - type to check """ return lib._is_primitive(t.id) From a3a0274d8d4bd2586419f3e928bbbf843494718b Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 1 Oct 2021 09:57:27 +0200 Subject: [PATCH 24/25] lint --- python/pyarrow/io.pxi | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 02439e454ae..7358cb7026f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1318,8 +1318,9 @@ ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): """ - An input stream that performs buffered reads from an unbuffered input 
stream,
-    which can mitigate the overhead of many small reads in some cases
+    An input stream that performs buffered reads from
+    an unbuffered input stream, which can mitigate the overhead
+    of many small reads in some cases.
 
     Parameters
     ----------
@@ -1381,8 +1382,9 @@ cdef class BufferedInputStream(NativeFile):
 
 cdef class BufferedOutputStream(NativeFile):
     """
-    An output stream that performs buffered writes from an unbuffered output stream,
-    which can mitigate the overhead of many small writes in some cases
+    An output stream that performs buffered writes to
+    an unbuffered output stream, which can mitigate the overhead
+    of many small writes in some cases.
 
     Parameters
     ----------

From 0aa9141455dc5e811fa6a968389bee295ef30588 Mon Sep 17 00:00:00 2001
From: Alessandro Molina
Date: Fri, 1 Oct 2021 15:25:36 +0200
Subject: [PATCH 25/25] Address docstring for CSFTensor

---
 python/pyarrow/tensor.pxi | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index e21258da36f..42fd4474155 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -841,6 +841,13 @@ shape: {0.shape}""".format(self)
 cdef class SparseCSFTensor(_Weakrefable):
     """
     A sparse CSF tensor.
+
+    CSF is a generalization of the compressed sparse row (CSR) index.
+
+    The CSF index recursively compresses each dimension of a tensor into a
+    set of prefix trees. Each path from a root to a leaf forms one tensor
+    non-zero index. CSF is implemented with two arrays of buffers and one
+    array of integers.
     """
 
     def __init__(self):
@@ -876,14 +883,20 @@ shape: {0.shape}""".format(self)
        data : numpy.ndarray
           Data used to populate the sparse tensor.
       indptr : numpy.ndarray
-          Range of the rows,
-          The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+          The sparsity structure.
+          Each two consecutive dimensions in a tensor correspond to
+          a buffer in indptr.
+          A pair of consecutive values at `indptr[dim][i]` and
+          `indptr[dim][i + 1]` signifies a range of nodes in
+          `indices[dim + 1]` that are children of the `indices[dim][i]` node.
       indices : numpy.ndarray
-          Column indices of the corresponding non-zero values.
+          Stores values of nodes.
+          Each tensor dimension corresponds to a buffer in indices.
       shape : tuple
           Shape of the matrix.
       axis_order : list, optional
-          The order of the axis.
+          The sequence in which dimensions were traversed to
+          produce the prefix tree.
       dim_names : list, optional
           Names of the dimensions.
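
Reviewer note (not part of the patch): a short usage sketch may help readers of the new CSF docstrings. The snippet below builds a SparseCSFTensor from a dense tensor via the Tensor conversion documented in the hunk above (from_tensor in current pyarrow); the non_zero_length and dim_names accessors are assumed to behave as for the other sparse tensor classes and should be checked against the installed pyarrow.

    import numpy as np
    import pyarrow as pa

    # A small 3-D array with three non-zero entries.
    dense = np.zeros((2, 3, 4))
    dense[0, 0, 1] = 1.0
    dense[0, 2, 3] = 2.0
    dense[1, 1, 0] = 3.0

    # Convert the dense data into a CSF-indexed sparse tensor.
    tensor = pa.Tensor.from_numpy(dense, dim_names=["x", "y", "z"])
    csf = pa.SparseCSFTensor.from_tensor(tensor)

    print(csf.shape)            # (2, 3, 4)
    print(csf.non_zero_length)  # 3 -- one prefix-tree leaf per non-zero value
    print(csf.dim_names)        # the dimension names given above

Each non-zero value of `dense` corresponds to one root-to-leaf path in the prefix trees described by indptr/indices, which is why non_zero_length matches the number of assignments made above.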
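Similarly, for the buffered stream docstrings touched in PATCH 24/25, a minimal sketch of how the wrappers are typically used; the positional constructor arguments (raw stream, buffer size) are assumed from the class definitions in io.pxi, and the 64 KiB buffer size is only illustrative.

    import pyarrow as pa

    # Collect many small writes in a 64 KiB buffer before they reach the
    # underlying (here: in-memory) output stream.
    raw = pa.BufferOutputStream()
    buffered = pa.BufferedOutputStream(raw, 64 * 1024)  # (stream, buffer_size)
    for i in range(1000):
        buffered.write(b"some small record\n")
    buffered.flush()       # push any buffered bytes into `raw`
    data = raw.getvalue()  # pyarrow.Buffer holding everything written

    # The read side mirrors this: buffered reads against a raw input stream.
    reader = pa.BufferedInputStream(pa.BufferReader(data), 64 * 1024)
    print(reader.read(17))  # b'some small record'

The buffering layer only changes how often the raw stream is touched; the bytes that come back out are identical to what was written.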