From 67e7619cf87a321cdaf4c354f829fd1177de928f Mon Sep 17 00:00:00 2001 From: forwardxu Date: Wed, 4 Feb 2026 22:33:42 +0800 Subject: [PATCH 1/5] docs:add array type support --- docs/src/guide/.pages | 1 + docs/src/guide/data_types.md | 341 +++++++++++++++++++++++++++++++++++ 2 files changed, 342 insertions(+) create mode 100644 docs/src/guide/data_types.md diff --git a/docs/src/guide/.pages b/docs/src/guide/.pages index 8f59e8d680f..e92b9a1182a 100644 --- a/docs/src/guide/.pages +++ b/docs/src/guide/.pages @@ -1,5 +1,6 @@ nav: - Read and Write: read_and_write.md + - Data Types: data_types.md - Data Evolution: data_evolution.md - Blob API: blob.md - JSON Support: json.md diff --git a/docs/src/guide/data_types.md b/docs/src/guide/data_types.md new file mode 100644 index 00000000000..8eed116372a --- /dev/null +++ b/docs/src/guide/data_types.md @@ -0,0 +1,341 @@ +# Data Types + +Lance uses [Apache Arrow](https://arrow.apache.org/) as its in-memory data format. This guide covers the supported data types with a focus on array types, which are essential for vector embeddings and machine learning applications. + +## Arrow Type System + +Lance supports the full Apache Arrow type system. When writing data through Python (PyArrow) or Rust (arrow-rs), the Arrow types are automatically mapped to Lance's internal representation. 
+ +### Primitive Types + +| Arrow Type | Description | Example Use Case | +|------------|-------------|------------------| +| `Boolean` | True/false values | Flags, filters | +| `Int8`, `Int16`, `Int32`, `Int64` | Signed integers | IDs, counts | +| `UInt8`, `UInt16`, `UInt32`, `UInt64` | Unsigned integers | IDs, indices | +| `Float16`, `Float32`, `Float64` | Floating point numbers | Measurements, scores | +| `Decimal128`, `Decimal256` | Fixed-precision decimals | Financial data | +| `Date32`, `Date64` | Date values | Timestamps | +| `Time32`, `Time64` | Time values | Time of day | +| `Timestamp` | Date and time with timezone | Event timestamps | +| `Duration` | Time duration | Elapsed time | + +### String and Binary Types + +| Arrow Type | Description | Example Use Case | +|------------|-------------|------------------| +| `Utf8` | Variable-length UTF-8 string | Text, names | +| `LargeUtf8` | Large UTF-8 string (64-bit offsets) | Large documents | +| `Binary` | Variable-length binary data | Raw bytes | +| `LargeBinary` | Large binary data (64-bit offsets) | Large blobs | +| `FixedSizeBinary(n)` | Fixed-length binary data | UUIDs, hashes | + +## Array Types for Vector Embeddings + +Lance provides excellent support for array types, which are critical for storing vector embeddings in AI/ML applications. + +### FixedSizeList - The Preferred Type for Vector Embeddings + +`FixedSizeList` is the recommended type for storing fixed-dimensional vector embeddings. Each vector has the same number of dimensions, making it highly efficient for storage and computation. 
+ +=== "Python" + + ```python + import lance + import pyarrow as pa + import numpy as np + + # Create a schema with a vector embedding column + # This defines a 128-dimensional float32 vector + schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("text", pa.utf8()), + pa.field("vector", pa.list_(pa.float32(), 128)), # FixedSizeList of 128 floats + ]) + + # Create sample data with embeddings + num_rows = 1000 + vectors = np.random.rand(num_rows, 128).astype(np.float32) + + table = pa.Table.from_pydict({ + "id": list(range(num_rows)), + "text": [f"document_{i}" for i in range(num_rows)], + "vector": [v.tolist() for v in vectors], + }, schema=schema) + + # Write to Lance format + ds = lance.write_dataset(table, "./embeddings.lance") + print(f"Created dataset with {ds.count_rows()} rows") + ``` + +=== "Rust" + + ```rust + use arrow_array::{ + ArrayRef, FixedSizeListArray, Float32Array, Int64Array, RecordBatch, StringArray, + }; + use arrow_schema::{DataType, Field, Schema}; + use lance::dataset::WriteParams; + use lance::Dataset; + use std::sync::Arc; + + #[tokio::main] + async fn main() -> lance::Result<()> { + // Define schema with a 128-dimensional vector column + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("text", DataType::Utf8, false), + Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + 128, + ), + false, + ), + ])); + + // Create sample data + let ids = Int64Array::from(vec![0, 1, 2]); + let texts = StringArray::from(vec!["doc_0", "doc_1", "doc_2"]); + + // Create vector embeddings (128-dimensional) + let values: Vec = (0..384).map(|i| i as f32 / 100.0).collect(); + let values_array = Float32Array::from(values); + let vectors = FixedSizeListArray::try_new_from_values(values_array, 128)?; + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(texts) as ArrayRef, + Arc::new(vectors) as ArrayRef, 
+ ], + )?; + + // Write to Lance + let dataset = Dataset::write( + vec![batch].into_iter().map(Ok), + "embeddings.lance", + WriteParams::default(), + ) + .await?; + + println!("Created dataset with {} rows", dataset.count_rows().await?); + Ok(()) + } + ``` + +### Vector Search with Embeddings + +Once you have vector embeddings stored in Lance, you can perform efficient vector similarity search: + +```python +import lance +import numpy as np + +# Open the dataset +ds = lance.dataset("./embeddings.lance") + +# Create a query vector (same dimension as stored vectors) +query_vector = np.random.rand(128).astype(np.float32).tolist() + +# Perform vector search - find 10 nearest neighbors +results = ds.to_table( + nearest={ + "column": "vector", + "q": query_vector, + "k": 10, + } +) +print(results.to_pandas()) +``` + +For production workloads with large datasets, create a vector index for much faster search: + +```python +# Create an IVF-PQ index for fast approximate nearest neighbor search +ds.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=256, # Number of IVF partitions + num_sub_vectors=16, # Number of PQ sub-vectors +) + +# Search with the index (automatically used) +results = ds.to_table( + nearest={ + "column": "vector", + "q": query_vector, + "k": 10, + "nprobes": 20, # Number of partitions to search + } +) +``` + +### List and LargeList - Variable-Length Arrays + +For variable-length arrays where each row may have a different number of elements, use `List` or `LargeList`: + +```python +import lance +import pyarrow as pa + +# Schema with variable-length arrays +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("tags", pa.list_(pa.utf8())), # Variable number of string tags + pa.field("scores", pa.list_(pa.float32())), # Variable number of float scores +]) + +table = pa.Table.from_pydict({ + "id": [1, 2, 3], + "tags": [["python", "ml"], ["rust"], ["data", "analytics", "ai"]], + "scores": [[0.9, 0.8], [0.95], [0.7, 0.85, 0.9]], +}, 
schema=schema) + +ds = lance.write_dataset(table, "./variable_arrays.lance") +``` + +## Nested and Complex Types + +### Struct Types + +Store structured data with multiple named fields: + +```python +import lance +import pyarrow as pa + +# Schema with nested struct +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("metadata", pa.struct([ + pa.field("source", pa.utf8()), + pa.field("timestamp", pa.timestamp("us")), + pa.field("embedding_model", pa.utf8()), + ])), + pa.field("vector", pa.list_(pa.float32(), 384)), # 384-dim embedding +]) + +table = pa.Table.from_pydict({ + "id": [1, 2], + "metadata": [ + {"source": "web", "timestamp": "2024-01-15T10:30:00", "embedding_model": "text-embedding-3-small"}, + {"source": "api", "timestamp": "2024-01-15T11:45:00", "embedding_model": "text-embedding-3-small"}, + ], + "vector": [ + [0.1] * 384, + [0.2] * 384, + ], +}, schema=schema) + +ds = lance.write_dataset(table, "./with_metadata.lance") +``` + +### Map Types + +Store key-value pairs with dynamic keys: + +```python +import lance +import pyarrow as pa + +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("attributes", pa.map_(pa.utf8(), pa.utf8())), +]) + +table = pa.Table.from_pydict({ + "id": [1, 2], + "attributes": [ + [("color", "red"), ("size", "large")], + [("color", "blue"), ("material", "cotton")], + ], +}, schema=schema) + +ds = lance.write_dataset(table, "./with_maps.lance") +``` + +## Data Type Mapping for Integrations + +When integrating Lance with other systems (like Apache Flink, Spark, or Presto), the following type mappings apply: + +| External Type | Lance/Arrow Type | Notes | +|--------------|------------------|-------| +| `BOOLEAN` | `Boolean` | | +| `TINYINT` | `Int8` | | +| `SMALLINT` | `Int16` | | +| `INT` / `INTEGER` | `Int32` | | +| `BIGINT` | `Int64` | | +| `FLOAT` | `Float32` | | +| `DOUBLE` | `Float64` | | +| `DECIMAL(p,s)` | `Decimal128(p,s)` | | +| `STRING` / `VARCHAR` | `Utf8` | | +| `CHAR(n)` | `Utf8` | Fixed-width 
string |
| `DATE` | `Date32` | |
| `TIME` | `Time64` | Microsecond precision |
| `TIMESTAMP` | `Timestamp` | |
| `TIMESTAMP WITH LOCAL TIMEZONE` | `Timestamp` | With timezone info |
| `BINARY` / `VARBINARY` | `Binary` | |
| `BYTES` | `Binary` | |
| `ARRAY` | `List(T)` | Variable-length array |
| `ARRAY(n)` | `FixedSizeList(T, n)` | Fixed-length array (vectors) |
| `ROW` / `STRUCT` | `Struct` | Nested structure |
| `MAP` | `Map(K, V)` | Key-value pairs |

### Vector Embeddings in Integrations

For vector embedding columns, use `ARRAY(n)` where `n` is the embedding dimension:

```sql
-- Example: Creating a table with vector embeddings in SQL-compatible systems
CREATE TABLE embeddings (
    id BIGINT,
    text STRING,
    vector ARRAY(384) -- 384-dimensional vector
);
```

This maps to Lance's `FixedSizeList(Float32, 384)` type, which is optimized for:

- Efficient columnar storage
- SIMD-accelerated distance computations
- Vector index creation and search

## Best Practices for Vector Data

1. **Use FixedSizeList for embeddings**: Always use `FixedSizeList` (not variable-length `List`) for vector embeddings to enable efficient storage and indexing.

2. **Choose appropriate precision**:
   - `Float32` is the standard choice, balancing precision and storage
   - `Float16` or `BFloat16` can reduce storage by 50% with minimal accuracy loss
   - `Int8` for quantized embeddings

3. **Align dimensions for SIMD**: Vector dimensions divisible by 8 enable optimal SIMD acceleration. Common dimensions: 128, 256, 384, 512, 768, 1024, 1536.

4. 
**Create indexes for large datasets**: For datasets with more than ~10,000 vectors, create an ANN index for fast search: + + ```python + # IVF_PQ is recommended for most use cases + ds.create_index("vector", index_type="IVF_PQ", num_partitions=256, num_sub_vectors=16) + + # IVF_HNSW_SQ offers better recall at the cost of more memory + ds.create_index("vector", index_type="IVF_HNSW_SQ", num_partitions=256) + ``` + +5. **Store metadata alongside vectors**: Lance efficiently handles mixed workloads with both vector and scalar data: + + ```python + # Combine vector search with metadata filtering + results = ds.to_table( + filter="category = 'electronics'", + nearest={"column": "vector", "q": query, "k": 10} + ) + ``` + +## See Also + +- [Vector Search Tutorial](../quickstart/vector-search.md) - Complete guide to vector search with Lance +- [Extension Arrays](arrays.md) - Special array types for ML (BFloat16, images) +- [Performance Guide](performance.md) - Optimization tips for large-scale deployments From a6800a8b274aa68fa90aa29ade97c0e452dde74e Mon Sep 17 00:00:00 2001 From: forwardxu Date: Thu, 5 Feb 2026 14:44:40 +0800 Subject: [PATCH 2/5] docs:add array type support --- docs/src/guide/data_types.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/guide/data_types.md b/docs/src/guide/data_types.md index 8eed116372a..080e0ca3929 100644 --- a/docs/src/guide/data_types.md +++ b/docs/src/guide/data_types.md @@ -15,7 +15,7 @@ Lance supports the full Apache Arrow type system. 
When writing data through Pyth | `UInt8`, `UInt16`, `UInt32`, `UInt64` | Unsigned integers | IDs, indices | | `Float16`, `Float32`, `Float64` | Floating point numbers | Measurements, scores | | `Decimal128`, `Decimal256` | Fixed-precision decimals | Financial data | -| `Date32`, `Date64` | Date values | Timestamps | +| `Date32`, `Date64` | Date values | Birth dates, event dates | | `Time32`, `Time64` | Time values | Time of day | | `Timestamp` | Date and time with timezone | Event timestamps | | `Duration` | Time duration | Elapsed time | @@ -272,7 +272,7 @@ When integrating Lance with other systems (like Apache Flink, Spark, or Presto), | `DOUBLE` | `Float64` | | | `DECIMAL(p,s)` | `Decimal128(p,s)` | | | `STRING` / `VARCHAR` | `Utf8` | | -| `CHAR(n)` | `Utf8` | Fixed-width string | +| `CHAR(n)` | `Utf8` | Fixed-width in source system; stored as variable-length Utf8 | | `DATE` | `Date32` | | | `TIME` | `Time64` | Microsecond precision | | `TIMESTAMP` | `Timestamp` | | From cf69ccf7ec7c44846ab1fe3a4d89dfad9163039c Mon Sep 17 00:00:00 2001 From: forwardxu Date: Thu, 5 Feb 2026 14:57:17 +0800 Subject: [PATCH 3/5] docs:add array type support --- docs/src/guide/data_types.md | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/docs/src/guide/data_types.md b/docs/src/guide/data_types.md index 080e0ca3929..c7853695f36 100644 --- a/docs/src/guide/data_types.md +++ b/docs/src/guide/data_types.md @@ -30,6 +30,53 @@ Lance supports the full Apache Arrow type system. When writing data through Pyth | `LargeBinary` | Large binary data (64-bit offsets) | Large blobs | | `FixedSizeBinary(n)` | Fixed-length binary data | UUIDs, hashes | +### Blob Type for Large Binary Objects + +Lance provides a specialized **Blob** type for efficiently storing and retrieving very large binary objects such as videos, images, audio files, or other multimedia content. 
Unlike regular binary columns, blobs are stored out-of-line and support lazy loading, which means you can read portions of the data without loading everything into memory. + +To create a blob column, add the `lance-encoding:blob` metadata to a `LargeBinary` field: + +```python +import pyarrow as pa +import lance + +# Define schema with a blob column for videos +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("filename", pa.utf8()), + pa.field("video", pa.large_binary(), metadata={"lance-encoding:blob": "true"}), +]) + +# Read video file +with open("sample_video.mp4", "rb") as f: + video_data = f.read() + +# Create and write dataset +table = pa.table({ + "id": [1], + "filename": ["sample_video.mp4"], + "video": [video_data], +}, schema=schema) + +ds = lance.write_dataset(table, "./videos.lance", schema=schema) +``` + +To read blob data, use `take_blobs()` which returns file-like objects for lazy reading: + +```python +# Retrieve blob as a file-like object (lazy loading) +blobs = ds.take_blobs("video", ids=[0]) + +# Use with libraries that accept file-like objects +import av # pip install av +with av.open(blobs[0]) as container: + for frame in container.decode(video=0): + # Process video frames without loading entire video into memory + pass +``` + +For more details, see the [Blob API Guide](blob.md). + ## Array Types for Vector Embeddings Lance provides excellent support for array types, which are critical for storing vector embeddings in AI/ML applications. 
@@ -279,6 +326,7 @@ When integrating Lance with other systems (like Apache Flink, Spark, or Presto), | `TIMESTAMP WITH LOCAL TIMEZONE` | `Timestamp` | With timezone info | | `BINARY` / `VARBINARY` | `Binary` | | | `BYTES` | `Binary` | | +| `BLOB` | `LargeBinary` with `lance-encoding:blob` | Large binary objects with lazy loading | | `ARRAY` | `List(T)` | Variable-length array | | `ARRAY(n)` | `FixedSizeList(T, n)` | Fixed-length array (vectors) | | `ROW` / `STRUCT` | `Struct` | Nested structure | @@ -337,5 +385,6 @@ This maps to Lance's `FixedSizeList(Float32, 384)` type, which is optimized for: ## See Also - [Vector Search Tutorial](../quickstart/vector-search.md) - Complete guide to vector search with Lance +- [Blob API Guide](blob.md) - Storing and retrieving large binary objects (videos, images) - [Extension Arrays](arrays.md) - Special array types for ML (BFloat16, images) - [Performance Guide](performance.md) - Optimization tips for large-scale deployments From 47d438f8cfde77cb8a38e94fdb4e0b2a11bc8355 Mon Sep 17 00:00:00 2001 From: forwardxu Date: Thu, 5 Feb 2026 15:41:53 +0800 Subject: [PATCH 4/5] docs:add array type support --- docs/src/guide/data_types.md | 168 +++++++++++++++++------------------ 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/docs/src/guide/data_types.md b/docs/src/guide/data_types.md index c7853695f36..6da9d59a8e1 100644 --- a/docs/src/guide/data_types.md +++ b/docs/src/guide/data_types.md @@ -85,93 +85,93 @@ Lance provides excellent support for array types, which are critical for storing `FixedSizeList` is the recommended type for storing fixed-dimensional vector embeddings. Each vector has the same number of dimensions, making it highly efficient for storage and computation. 
-=== "Python" +#### Python Example - ```python - import lance - import pyarrow as pa - import numpy as np - - # Create a schema with a vector embedding column - # This defines a 128-dimensional float32 vector - schema = pa.schema([ - pa.field("id", pa.int64()), - pa.field("text", pa.utf8()), - pa.field("vector", pa.list_(pa.float32(), 128)), # FixedSizeList of 128 floats - ]) - - # Create sample data with embeddings - num_rows = 1000 - vectors = np.random.rand(num_rows, 128).astype(np.float32) - - table = pa.Table.from_pydict({ - "id": list(range(num_rows)), - "text": [f"document_{i}" for i in range(num_rows)], - "vector": [v.tolist() for v in vectors], - }, schema=schema) - - # Write to Lance format - ds = lance.write_dataset(table, "./embeddings.lance") - print(f"Created dataset with {ds.count_rows()} rows") - ``` +```python +import lance +import pyarrow as pa +import numpy as np -=== "Rust" - - ```rust - use arrow_array::{ - ArrayRef, FixedSizeListArray, Float32Array, Int64Array, RecordBatch, StringArray, - }; - use arrow_schema::{DataType, Field, Schema}; - use lance::dataset::WriteParams; - use lance::Dataset; - use std::sync::Arc; - - #[tokio::main] - async fn main() -> lance::Result<()> { - // Define schema with a 128-dimensional vector column - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int64, false), - Field::new("text", DataType::Utf8, false), - Field::new( - "vector", - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Float32, true)), - 128, - ), - false, +# Create a schema with a vector embedding column +# This defines a 128-dimensional float32 vector +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("text", pa.utf8()), + pa.field("vector", pa.list_(pa.float32(), 128)), # FixedSizeList of 128 floats +]) + +# Create sample data with embeddings +num_rows = 1000 +vectors = np.random.rand(num_rows, 128).astype(np.float32) + +table = pa.Table.from_pydict({ + "id": list(range(num_rows)), + "text": 
[f"document_{i}" for i in range(num_rows)], + "vector": [v.tolist() for v in vectors], +}, schema=schema) + +# Write to Lance format +ds = lance.write_dataset(table, "./embeddings.lance") +print(f"Created dataset with {ds.count_rows()} rows") +``` + +#### Rust Example + +```rust +use arrow_array::{ + ArrayRef, FixedSizeListArray, Float32Array, Int64Array, RecordBatch, StringArray, +}; +use arrow_schema::{DataType, Field, Schema}; +use lance::dataset::WriteParams; +use lance::Dataset; +use std::sync::Arc; + +#[tokio::main] +async fn main() -> lance::Result<()> { + // Define schema with a 128-dimensional vector column + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("text", DataType::Utf8, false), + Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + 128, ), - ])); - - // Create sample data - let ids = Int64Array::from(vec![0, 1, 2]); - let texts = StringArray::from(vec!["doc_0", "doc_1", "doc_2"]); - - // Create vector embeddings (128-dimensional) - let values: Vec = (0..384).map(|i| i as f32 / 100.0).collect(); - let values_array = Float32Array::from(values); - let vectors = FixedSizeListArray::try_new_from_values(values_array, 128)?; - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(ids) as ArrayRef, - Arc::new(texts) as ArrayRef, - Arc::new(vectors) as ArrayRef, - ], - )?; - - // Write to Lance - let dataset = Dataset::write( - vec![batch].into_iter().map(Ok), - "embeddings.lance", - WriteParams::default(), - ) - .await?; - - println!("Created dataset with {} rows", dataset.count_rows().await?); - Ok(()) - } - ``` + false, + ), + ])); + + // Create sample data + let ids = Int64Array::from(vec![0, 1, 2]); + let texts = StringArray::from(vec!["doc_0", "doc_1", "doc_2"]); + + // Create vector embeddings (128-dimensional) + let values: Vec = (0..384).map(|i| i as f32 / 100.0).collect(); + let values_array = Float32Array::from(values); + 
let vectors = FixedSizeListArray::try_new_from_values(values_array, 128)?; + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(texts) as ArrayRef, + Arc::new(vectors) as ArrayRef, + ], + )?; + + // Write to Lance + let dataset = Dataset::write( + vec![batch].into_iter().map(Ok), + "embeddings.lance", + WriteParams::default(), + ) + .await?; + + println!("Created dataset with {} rows", dataset.count_rows().await?); + Ok(()) +} +``` ### Vector Search with Embeddings From e37fbfcb3ab91361640985b922121a7b2980e42c Mon Sep 17 00:00:00 2001 From: forwardxu Date: Thu, 5 Feb 2026 15:50:10 +0800 Subject: [PATCH 5/5] docs:add array type support --- docs/src/guide/data_types.md | 168 +++++++++++++++++------------------ 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/docs/src/guide/data_types.md b/docs/src/guide/data_types.md index 6da9d59a8e1..c7853695f36 100644 --- a/docs/src/guide/data_types.md +++ b/docs/src/guide/data_types.md @@ -85,93 +85,93 @@ Lance provides excellent support for array types, which are critical for storing `FixedSizeList` is the recommended type for storing fixed-dimensional vector embeddings. Each vector has the same number of dimensions, making it highly efficient for storage and computation. 
-#### Python Example +=== "Python" -```python -import lance -import pyarrow as pa -import numpy as np - -# Create a schema with a vector embedding column -# This defines a 128-dimensional float32 vector -schema = pa.schema([ - pa.field("id", pa.int64()), - pa.field("text", pa.utf8()), - pa.field("vector", pa.list_(pa.float32(), 128)), # FixedSizeList of 128 floats -]) - -# Create sample data with embeddings -num_rows = 1000 -vectors = np.random.rand(num_rows, 128).astype(np.float32) - -table = pa.Table.from_pydict({ - "id": list(range(num_rows)), - "text": [f"document_{i}" for i in range(num_rows)], - "vector": [v.tolist() for v in vectors], -}, schema=schema) - -# Write to Lance format -ds = lance.write_dataset(table, "./embeddings.lance") -print(f"Created dataset with {ds.count_rows()} rows") -``` + ```python + import lance + import pyarrow as pa + import numpy as np + + # Create a schema with a vector embedding column + # This defines a 128-dimensional float32 vector + schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("text", pa.utf8()), + pa.field("vector", pa.list_(pa.float32(), 128)), # FixedSizeList of 128 floats + ]) + + # Create sample data with embeddings + num_rows = 1000 + vectors = np.random.rand(num_rows, 128).astype(np.float32) + + table = pa.Table.from_pydict({ + "id": list(range(num_rows)), + "text": [f"document_{i}" for i in range(num_rows)], + "vector": [v.tolist() for v in vectors], + }, schema=schema) + + # Write to Lance format + ds = lance.write_dataset(table, "./embeddings.lance") + print(f"Created dataset with {ds.count_rows()} rows") + ``` -#### Rust Example - -```rust -use arrow_array::{ - ArrayRef, FixedSizeListArray, Float32Array, Int64Array, RecordBatch, StringArray, -}; -use arrow_schema::{DataType, Field, Schema}; -use lance::dataset::WriteParams; -use lance::Dataset; -use std::sync::Arc; - -#[tokio::main] -async fn main() -> lance::Result<()> { - // Define schema with a 128-dimensional vector column - let schema = 
Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int64, false), - Field::new("text", DataType::Utf8, false), - Field::new( - "vector", - DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Float32, true)), - 128, +=== "Rust" + + ```rust + use arrow_array::{ + ArrayRef, FixedSizeListArray, Float32Array, Int64Array, RecordBatch, StringArray, + }; + use arrow_schema::{DataType, Field, Schema}; + use lance::dataset::WriteParams; + use lance::Dataset; + use std::sync::Arc; + + #[tokio::main] + async fn main() -> lance::Result<()> { + // Define schema with a 128-dimensional vector column + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("text", DataType::Utf8, false), + Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + 128, + ), + false, ), - false, - ), - ])); - - // Create sample data - let ids = Int64Array::from(vec![0, 1, 2]); - let texts = StringArray::from(vec!["doc_0", "doc_1", "doc_2"]); - - // Create vector embeddings (128-dimensional) - let values: Vec = (0..384).map(|i| i as f32 / 100.0).collect(); - let values_array = Float32Array::from(values); - let vectors = FixedSizeListArray::try_new_from_values(values_array, 128)?; - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(ids) as ArrayRef, - Arc::new(texts) as ArrayRef, - Arc::new(vectors) as ArrayRef, - ], - )?; - - // Write to Lance - let dataset = Dataset::write( - vec![batch].into_iter().map(Ok), - "embeddings.lance", - WriteParams::default(), - ) - .await?; - - println!("Created dataset with {} rows", dataset.count_rows().await?); - Ok(()) -} -``` + ])); + + // Create sample data + let ids = Int64Array::from(vec![0, 1, 2]); + let texts = StringArray::from(vec!["doc_0", "doc_1", "doc_2"]); + + // Create vector embeddings (128-dimensional) + let values: Vec = (0..384).map(|i| i as f32 / 100.0).collect(); + let values_array = Float32Array::from(values); + 
let vectors = FixedSizeListArray::try_new_from_values(values_array, 128)?; + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(texts) as ArrayRef, + Arc::new(vectors) as ArrayRef, + ], + )?; + + // Write to Lance + let dataset = Dataset::write( + vec![batch].into_iter().map(Ok), + "embeddings.lance", + WriteParams::default(), + ) + .await?; + + println!("Created dataset with {} rows", dataset.count_rows().await?); + Ok(()) + } + ``` ### Vector Search with Embeddings