diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 0c1fea6f4161..05628084c844 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -54,11 +54,77 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } -/// An array of [variable length arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) +/// An array of [variable length lists], similar to JSON arrays +/// (e.g. `["A", "B", "C"]`). /// -/// See [`ListArray`] and [`LargeListArray`]` +/// Lists are represented using `offsets` into a `values` child +/// array. Offsets are stored in two adjacent entries of an +/// [`OffsetBuffer`]. /// -/// See [`GenericListBuilder`](crate::builder::GenericListBuilder) for how to construct a [`GenericListArray`] +/// Arrow defines [`ListArray`] with `i32` offsets and +/// [`LargeListArray`] with `i64` offsets. +/// +/// Use [`GenericListBuilder`](crate::builder::GenericListBuilder) to +/// construct a [`GenericListArray`]. +/// +/// # Representation +/// +/// A [`ListArray`] can represent a list of values of any other +/// supported Arrow type. Each element of the `ListArray` itself is +/// a list which may be empty, may contain NULL and non-null values, +/// or may itself be NULL. +/// +/// For example, the `ListArray` shown in the following diagram stores +/// lists of strings. Note that `[]` represents an empty (length +/// 0), but non NULL list. +/// +/// ```text +/// ┌─────────────┐ +/// │ [A,B,C] │ +/// ├─────────────┤ +/// │ [] │ +/// ├─────────────┤ +/// │ NULL │ +/// ├─────────────┤ +/// │ [D] │ +/// ├─────────────┤ +/// │ [NULL, F] │ +/// └─────────────┘ +/// ``` +/// +/// The `values` are stored in a child [`StringArray`] and the offsets +/// are stored in an [`OffsetBuffer`] as shown in the following +/// diagram. The logical values and offsets are shown on the left, and +/// the actual `ListArray` encoding on the right. +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ +/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ +/// │ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ │ 1 │ │ A │ │ 0 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [] │ │ (3,3) │ │ 1 │ │ 3 │ │ │ 1 │ │ B │ │ 1 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ NULL │ │ (3,4) │ │ 0 │ │ 3 │ │ │ 1 │ │ C │ │ 2 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [D] │ │ (4,5) │ │ 1 │ │ 4 │ │ │ ? │ │ ? │ │ 3 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [NULL, F] │ │ (5,7) │ │ 1 │ │ 5 │ │ │ 1 │ │ D │ │ 4 │ +/// └─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤ +/// │ 7 │ │ │ 0 │ │ ? │ │ 5 │ +/// │ Validity └───┘ ├───┤ ├───┤ +/// Logical Logical (nulls) Offsets │ │ 1 │ │ F │ │ 6 │ +/// Values Offsets │ └───┘ └───┘ +/// │ Values │ │ +/// (offsets[i], │ ListArray (Array) +/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// +/// +/// ``` +/// +/// [`StringArray`]: crate::array::StringArray +/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout pub struct GenericListArray { data_type: DataType, nulls: Option, diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index b31814615fc9..5cc7f7b04e0a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -28,6 +28,60 @@ use std::sync::Arc; /// /// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. /// +/// # Example +/// +/// Here is code that constructs a ListArray with the contents: +/// `[[A,B,C], [], NULL, [D], [NULL, F]]` +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array}; +/// # +/// let values_builder = StringBuilder::new(); +/// let mut builder = ListBuilder::new(values_builder); +/// +/// // [A, B, C] +/// builder.values().append_value("A"); +/// builder.values().append_value("B"); +/// builder.values().append_value("C"); +/// builder.append(true); +/// +/// // [ ] (empty list) +/// builder.append(true); +/// +/// // Null +/// builder.values().append_value("?"); // irrelevant +/// builder.append(false); +/// +/// // [D] +/// builder.values().append_value("D"); +/// builder.append(true); +/// +/// // [NULL, F] +/// builder.values().append_null(); +/// builder.values().append_value("F"); +/// builder.append(true); +/// +/// // Build the array +/// let array = builder.finish(); +/// +/// // Values is a string array +/// // "A", "B" "C", "?", "D", NULL, "F" +/// assert_eq!( +/// array.values().as_ref(), +/// &StringArray::from(vec![ +/// Some("A"), Some("B"), Some("C"), +/// Some("?"), Some("D"), None, +/// Some("F") +/// ]) +/// ); +/// +/// // Offsets are indexes into the values array +/// assert_eq!( +/// array.value_offsets(), +/// &[0, 3, 3, 4, 5, 7] +/// ); +/// ``` /// /// [`ListBuilder`]: crate::builder::ListBuilder /// [`ListArray`]: crate::array::ListArray diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 0111d12fbab1..fede32c57924 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -19,7 +19,43 @@ use crate::buffer::ScalarBuffer; use crate::{ArrowNativeType, MutableBuffer}; use std::ops::Deref; -/// A non-empty buffer of monotonically increasing, positive integers +/// A non-empty buffer of monotonically increasing, positive integers. +/// +/// [`OffsetBuffer`] are used to represent ranges of offsets. An +/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start +/// offset for element `i` is `offsets[i]` and the end offset is +/// `offsets[i+1]`. Equal offsets represent an empty range. +/// +/// # Example +/// +/// This example shows how 5 distinct ranges, are represented using a +/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the +/// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets +/// (e.g. an empty list). +/// +/// ```text +/// ┌───────┐ ┌───┐ +/// │ (0,3) │ │ 0 │ +/// ├───────┤ ├───┤ +/// │ (3,3) │ │ 3 │ +/// ├───────┤ ├───┤ +/// │ (3,4) │ │ 3 │ +/// ├───────┤ ├───┤ +/// │ (4,5) │ │ 4 │ +/// ├───────┤ ├───┤ +/// │ (5,7) │ │ 5 │ +/// └───────┘ ├───┤ +/// │ 7 │ +/// └───┘ +/// +/// Offsets Buffer +/// Logical +/// Offsets +/// +/// (offsets[i], +/// offsets[i+1]) +/// ``` + #[derive(Debug, Clone)] pub struct OffsetBuffer(ScalarBuffer);