From aea1039dd0e3432ce8d0660abe87e50a7b489f5f Mon Sep 17 00:00:00 2001 From: lijinglun Date: Wed, 1 Oct 2025 13:11:40 +0800 Subject: [PATCH 1/4] docs: add json to full text search document --- docs/src/format/table/index/scalar/fts.md | 60 ++++++++++++++++++++++- python/python/lance/dataset.py | 20 ++++++-- python/python/tests/test_scalar_index.py | 53 ++++++++++++++++++++ python/src/dataset.rs | 3 ++ 4 files changed, 131 insertions(+), 5 deletions(-) diff --git a/docs/src/format/table/index/scalar/fts.md b/docs/src/format/table/index/scalar/fts.md index 15f3133210e..4b509761605 100644 --- a/docs/src/format/table/index/scalar/fts.md +++ b/docs/src/format/table/index/scalar/fts.md @@ -54,6 +54,7 @@ The metadata file contains JSON-serialized configuration and partition informati | Field | Type | Default | Description | |---------------------|---------|-----------|----------------------------------------------------------------| +| `lance_tokenizer` | String | "text" | Lance tokenizer type (see Tokenizers section) | | `base_tokenizer` | String | "simple" | Base tokenizer type (see Tokenizers section) | | `language` | String | "English" | Language for stemming and stop words | | `with_position` | Boolean | false | Store term positions for phrase queries (increases index size) | @@ -68,7 +69,64 @@ The metadata file contains JSON-serialized configuration and partition informati ## Tokenizers -The full text search index supports multiple tokenizer types for different text processing needs: +The full text search index supports multiple tokenizer types for different text processing needs. +There are two different tokenizer configurations: `lance_tokenizer` and `base_tokenizer`. + +The `lance_tokenizer` is responsible for handling different document types, such as text and json, +while the `base_tokenizer` is responsible for tokenizing documents. 
+ +### Lance Tokenizers +| Tokenizer | Description | Use Case | +|-----------|----------------------------------------------------------------------|-------------------------| +| **text** | Parse TEXT document into tokens. | Text document (default) | +| **json** | Parse JSON document into tokens in triplet format `path,type,value`. | Json document | + +#### Text Tokenizer +Text Tokenizer is responsible for handling TEXT-type data, which is Utf8, LargeUtf8 or List of them in arrow format. +The Text Tokenizer behaves consistently in both query and document parsing scenarios, which means that if a document +contains the word "lance," we can retrieve it using a query with "lance". + +#### Json Tokenizer +The Json Tokenizer is responsible for handling JSON-type data, which is the JSON type in arrow format. +Unlike Text Tokenizer, the Json Tokenizer behaves differently in "query" and "document parsing" scenarios. + +JSON is a nested structure, a JSON document can always be converted into triplets in format: `path,type,value`. That's +how lance handle JSON during document parsing, breaking down JSON document into tokens in the triplet format. +In scenarios where the triplet value is a text, the text value will be further tokenized using the base_tokenizer, +resulting in multiple triplet tokens. + +During "querying," the Json Tokenizer uses the triplet format instead of the JSON format, which simplifies the query +syntax. + +The example below shows how the Json Tokenizer works. Assume we have the following JSON document: +```json +{ + "name": "Lance", + "legal.age": 30, + "address": { + "city": "San Francisco", + "zip:us": 94102 + } +} +``` + +After parsing, the document will be tokenized into the following tokens: +``` +name,str,Lance +legal.age,number,30 +address.city,str,San +address.city,str,Francisco +address.zip:us,number,94102 +``` + +Then we do full text search in triplet format. 
To search for "San Francisco," we can search with one of the triplets +below: +``` +address.city:San Francisco +address.city:San +address.city:Francisco +``` + ### Base Tokenizers diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 783a0772b97..ce51f66ffee 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2397,12 +2397,12 @@ def create_scalar_index( value_type = field_type if pa.types.is_list(field_type) or pa.types.is_large_list(field_type): value_type = field_type.value_type - if not pa.types.is_string(value_type) and not pa.types.is_large_string( - value_type - ): + if (not pa.types.is_string(value_type) + and not pa.types.is_large_string(value_type) + and not _is_json_column(field)): raise TypeError( f"INVERTED index column {column} must be string, large string" - " or list of strings, but got {value_type}" + " list of strings or json, but got {value_type}" ) if pa.types.is_duration(field_type): @@ -3452,6 +3452,18 @@ def centroids( return ivf.centroids +def _is_json_column(field: pyarrow.lib.Field): + field_type = field.type + if hasattr(field_type, "storage_type"): + field_type = field_type.storage_type + + if not pa.types.is_large_binary(field_type): + return False + if b'ARROW:extension:name' in field.metadata: + return field.metadata[b'ARROW:extension:name'] == b"lance.json" + return False + + class SqlQuery: """ An executable SQL query. 
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index d33ca4a9b6d..f5b2b4e3904 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1777,6 +1777,59 @@ def test_zonemap_index_remapping(tmp_path: Path): result = scanner.to_table() assert result.num_rows == 501 # 1000..1500 inclusive +def test_json_inverted_index(): + vals = [ + """{ + "name": "Tom", + "legal.age": 30, + "address": { + "city": "San Francisco", + "zip:us": 94102 + } + }""", + """{ + "name": "Alice", + "legal.age": 30, + "address": { + "city": "New York City", + "zip:us": 11215 + } + }""", + """{ + "name": "Bob", + "legal.age": 32, + "address": { + "city": "Las Vegas", + "zip:us": 89101 + } + }""" + ] + tbl = pa.table({"jsons": pa.array(vals, pa.json_())}) + ds = lance.write_dataset(tbl, "memory://test") + + ds.create_scalar_index( + "jsons", + index_type="INVERTED", + lance_tokenizer="json", + ) + result = ds.to_table(columns=["jsons"], + full_text_query=MatchQuery( + "legal.age,number,30", + "jsons")) + assert result.num_rows == 2 + + result = ds.to_table(columns=["jsons"], + full_text_query=MatchQuery( + "address.zip:us,number,89101", + "jsons")) + assert result.num_rows == 1 + + result = ds.to_table(columns=["jsons"], + full_text_query=MatchQuery( + "address.city,str,Francisco", + "jsons")) + assert result.num_rows == 1 + def test_json_index(): vals = ['{"x": 7, "y": 10}', '{"x": 11, "y": 22}', '{"y": 0}', '{"x": 10}'] diff --git a/python/src/dataset.rs b/python/src/dataset.rs index c9c56c95673..49b275b8e5a 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1742,6 +1742,9 @@ impl Dataset { if let Some(with_position) = kwargs.get_item("with_position")? { params = params.with_position(with_position.extract()?); } + if let Some(lance_tokenizer) = kwargs.get_item("lance_tokenizer")? 
{ + params = params.lance_tokenizer(lance_tokenizer.extract()?); + } if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? { params = params.base_tokenizer(base_tokenizer.extract()?); } From 596c55bd5002a772e2d06aba92829cd237172b0c Mon Sep 17 00:00:00 2001 From: lijinglun Date: Fri, 3 Oct 2025 17:00:16 +0800 Subject: [PATCH 2/4] fmt --- docs/src/format/table/index/scalar/fts.md | 4 ++-- python/python/lance/dataset.py | 12 ++++++----- python/python/tests/test_scalar_index.py | 26 +++++++++++------------ 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/src/format/table/index/scalar/fts.md b/docs/src/format/table/index/scalar/fts.md index 4b509761605..ec45ca72366 100644 --- a/docs/src/format/table/index/scalar/fts.md +++ b/docs/src/format/table/index/scalar/fts.md @@ -88,14 +88,14 @@ contains the word "lance," we can retrieve it using a query with "lance". #### Json Tokenizer The Json Tokenizer is responsible for handling JSON-type data, which is the JSON type in arrow format. -Unlike Text Tokenizer, the Json Tokenizer behaves differently in "query" and "document parsing" scenarios. +Unlike Text Tokenizer, the Json Tokenizer behaves differently in query and document parsing scenarios. JSON is a nested structure, a JSON document can always be converted into triplets in format: `path,type,value`. That's how lance handle JSON during document parsing, breaking down JSON document into tokens in the triplet format. In scenarios where the triplet value is a text, the text value will be further tokenized using the base_tokenizer, resulting in multiple triplet tokens. -During "querying," the Json Tokenizer uses the triplet format instead of the JSON format, which simplifies the query +During querying, the Json Tokenizer uses the triplet format instead of the JSON format, which simplifies the query syntax. The example below shows how the Json Tokenizer works. 
Assume we have the following JSON document: diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ce51f66ffee..b702f787059 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2397,9 +2397,11 @@ def create_scalar_index( value_type = field_type if pa.types.is_list(field_type) or pa.types.is_large_list(field_type): value_type = field_type.value_type - if (not pa.types.is_string(value_type) - and not pa.types.is_large_string(value_type) - and not _is_json_column(field)): + if ( + not pa.types.is_string(value_type) + and not pa.types.is_large_string(value_type) + and not _is_json_column(field) + ): raise TypeError( f"INVERTED index column {column} must be string, large string" " list of strings or json, but got {value_type}" @@ -3459,8 +3461,8 @@ def _is_json_column(field: pyarrow.lib.Field): if not pa.types.is_large_binary(field_type): return False - if b'ARROW:extension:name' in field.metadata: - return field.metadata[b'ARROW:extension:name'] == b"lance.json" + if b"ARROW:extension:name" in field.metadata: + return field.metadata[b"ARROW:extension:name"] == b"lance.json" return False diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index f5b2b4e3904..dc66a30a9f6 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1777,6 +1777,7 @@ def test_zonemap_index_remapping(tmp_path: Path): result = scanner.to_table() assert result.num_rows == 501 # 1000..1500 inclusive + def test_json_inverted_index(): vals = [ """{ @@ -1802,7 +1803,7 @@ def test_json_inverted_index(): "city": "Las Vegas", "zip:us": 89101 } - }""" + }""", ] tbl = pa.table({"jsons": pa.array(vals, pa.json_())}) ds = lance.write_dataset(tbl, "memory://test") @@ -1812,22 +1813,21 @@ def test_json_inverted_index(): index_type="INVERTED", lance_tokenizer="json", ) - result = ds.to_table(columns=["jsons"], - full_text_query=MatchQuery( - 
"legal.age,number,30", - "jsons")) + result = ds.to_table( + columns=["jsons"], full_text_query=MatchQuery("legal.age,number,30", "jsons") + ) assert result.num_rows == 2 - result = ds.to_table(columns=["jsons"], - full_text_query=MatchQuery( - "address.zip:us,number,89101", - "jsons")) + result = ds.to_table( + columns=["jsons"], + full_text_query=MatchQuery("address.zip:us,number,89101", "jsons"), + ) assert result.num_rows == 1 - result = ds.to_table(columns=["jsons"], - full_text_query=MatchQuery( - "address.city,str,Francisco", - "jsons")) + result = ds.to_table( + columns=["jsons"], + full_text_query=MatchQuery("address.city,str,Francisco", "jsons"), + ) assert result.num_rows == 1 From 93f778574756c04d4c6f8bbd55e942da78964ef9 Mon Sep 17 00:00:00 2001 From: lijinglun Date: Sun, 26 Oct 2025 22:39:49 +0800 Subject: [PATCH 3/4] fmt --- docs/src/format/table/index/scalar/fts.md | 25 ++++++----- python/python/lance/dataset.py | 20 ++------- python/python/tests/test_scalar_index.py | 53 ----------------------- python/src/dataset.rs | 3 -- 4 files changed, 17 insertions(+), 84 deletions(-) diff --git a/docs/src/format/table/index/scalar/fts.md b/docs/src/format/table/index/scalar/fts.md index ec45ca72366..b8bd48393ef 100644 --- a/docs/src/format/table/index/scalar/fts.md +++ b/docs/src/format/table/index/scalar/fts.md @@ -54,7 +54,7 @@ The metadata file contains JSON-serialized configuration and partition informati | Field | Type | Default | Description | |---------------------|---------|-----------|----------------------------------------------------------------| -| `lance_tokenizer` | String | "text" | Lance tokenizer type (see Tokenizers section) | +| `lance_tokenizer` | String | None | Tokenizer for document type (see Tokenizers section) | | `base_tokenizer` | String | "simple" | Base tokenizer type (see Tokenizers section) | | `language` | String | "English" | Language for stemming and stop words | | `with_position` | Boolean | false | Store term positions for 
phrase queries (increases index size) | @@ -69,22 +69,25 @@ The metadata file contains JSON-serialized configuration and partition informati ## Tokenizers -The full text search index supports multiple tokenizer types for different text processing needs. -There are two different tokenizer configurations: `lance_tokenizer` and `base_tokenizer`. +There are two tokenizers in InvertedIndexParams: `lance_tokenizer` and `base_tokenizer`. The `lance_tokenizer` is +responsible for parsing different document types, such as text and json. It will be auto-inferred based on the +document type, user shouldn't specify it manually. See [Lance Tokenizers](#lance-tokenizers). -The `lance_tokenizer` is responsible for handling different document types, such as text and json, -while the `base_tokenizer` is responsible for tokenizing documents. +The `base_tokenizer` is responsible for tokenizing documents. It supports multiple types for different text processing +needs, see [Base Tokenizers](#base-tokenizers). ### Lance Tokenizers -| Tokenizer | Description | Use Case | -|-----------|----------------------------------------------------------------------|-------------------------| -| **text** | Parse TEXT document into tokens. | Text document (default) | -| **json** | Parse JSON document into tokens in triplet format `path,type,value`. | Json document | +| Tokenizer | Description | Use Case | +|-----------|----------------------------------------------------------------------|-----------------| +| **text** | Parse TEXT document into tokens. | Text document | +| **json** | Parse JSON document into tokens in triplet format `path,type,value`. | Json document | #### Text Tokenizer Text Tokenizer is responsible for handling TEXT-type data, which is Utf8, LargeUtf8 or List of them in arrow format. -The Text Tokenizer behaves consistently in both query and document parsing scenarios, which means that if a document -contains the word "lance," we can retrieve it using a query with "lance". 
+ +Text Tokenizer applies the same rules when tokenizing query statements and documents. This rule may not necessarily +apply to other document types. As document types become more complex, tokenizer might use different rules for query +tokenization and document tokenization, such as the JSON Tokenizer shown below. #### Json Tokenizer The Json Tokenizer is responsible for handling JSON-type data, which is the JSON type in arrow format. diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index b702f787059..783a0772b97 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2397,14 +2397,12 @@ def create_scalar_index( value_type = field_type if pa.types.is_list(field_type) or pa.types.is_large_list(field_type): value_type = field_type.value_type - if ( - not pa.types.is_string(value_type) - and not pa.types.is_large_string(value_type) - and not _is_json_column(field) + if not pa.types.is_string(value_type) and not pa.types.is_large_string( + value_type ): raise TypeError( f"INVERTED index column {column} must be string, large string" - " list of strings or json, but got {value_type}" + " or list of strings, but got {value_type}" ) if pa.types.is_duration(field_type): @@ -3454,18 +3452,6 @@ def centroids( return ivf.centroids -def _is_json_column(field: pyarrow.lib.Field): - field_type = field.type - if hasattr(field_type, "storage_type"): - field_type = field_type.storage_type - - if not pa.types.is_large_binary(field_type): - return False - if b"ARROW:extension:name" in field.metadata: - return field.metadata[b"ARROW:extension:name"] == b"lance.json" - return False - - class SqlQuery: """ An executable SQL query. 
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index dc66a30a9f6..d33ca4a9b6d 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1778,59 +1778,6 @@ def test_zonemap_index_remapping(tmp_path: Path): assert result.num_rows == 501 # 1000..1500 inclusive -def test_json_inverted_index(): - vals = [ - """{ - "name": "Tom", - "legal.age": 30, - "address": { - "city": "San Francisco", - "zip:us": 94102 - } - }""", - """{ - "name": "Alice", - "legal.age": 30, - "address": { - "city": "New York City", - "zip:us": 11215 - } - }""", - """{ - "name": "Bob", - "legal.age": 32, - "address": { - "city": "Las Vegas", - "zip:us": 89101 - } - }""", - ] - tbl = pa.table({"jsons": pa.array(vals, pa.json_())}) - ds = lance.write_dataset(tbl, "memory://test") - - ds.create_scalar_index( - "jsons", - index_type="INVERTED", - lance_tokenizer="json", - ) - result = ds.to_table( - columns=["jsons"], full_text_query=MatchQuery("legal.age,number,30", "jsons") - ) - assert result.num_rows == 2 - - result = ds.to_table( - columns=["jsons"], - full_text_query=MatchQuery("address.zip:us,number,89101", "jsons"), - ) - assert result.num_rows == 1 - - result = ds.to_table( - columns=["jsons"], - full_text_query=MatchQuery("address.city,str,Francisco", "jsons"), - ) - assert result.num_rows == 1 - - def test_json_index(): vals = ['{"x": 7, "y": 10}', '{"x": 11, "y": 22}', '{"y": 0}', '{"x": 10}'] tbl = pa.table({"jsons": pa.array(vals, pa.json_())}) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 49b275b8e5a..c9c56c95673 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1742,9 +1742,6 @@ impl Dataset { if let Some(with_position) = kwargs.get_item("with_position")? { params = params.with_position(with_position.extract()?); } - if let Some(lance_tokenizer) = kwargs.get_item("lance_tokenizer")? 
{ - params = params.lance_tokenizer(lance_tokenizer.extract()?); - } if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? { params = params.base_tokenizer(base_tokenizer.extract()?); } From edb845fad782092fcd7cb1d1f716948283aa0a71 Mon Sep 17 00:00:00 2001 From: lijinglun Date: Mon, 27 Oct 2025 16:36:08 +0800 Subject: [PATCH 4/4] refactor --- docs/src/format/table/index/scalar/fts.md | 123 +++++++++++----------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/docs/src/format/table/index/scalar/fts.md b/docs/src/format/table/index/scalar/fts.md index b8bd48393ef..5af36d294b8 100644 --- a/docs/src/format/table/index/scalar/fts.md +++ b/docs/src/format/table/index/scalar/fts.md @@ -54,7 +54,6 @@ The metadata file contains JSON-serialized configuration and partition informati | Field | Type | Default | Description | |---------------------|---------|-----------|----------------------------------------------------------------| -| `lance_tokenizer` | String | None | Tokenizer for document type (see Tokenizers section) | | `base_tokenizer` | String | "simple" | Base tokenizer type (see Tokenizers section) | | `language` | String | "English" | Language for stemming and stop words | | `with_position` | Boolean | false | Store term positions for phrase queries (increases index size) | @@ -69,67 +68,7 @@ The metadata file contains JSON-serialized configuration and partition informati ## Tokenizers -There are two tokenizers in InvertedIndexParams: `lance_tokenizer` and `base_tokenizer`. The `lance_tokenizer` is -responsible for parsing different document types, such as text and json. It will be auto-inferred based on the -document type, user shouldn't specify it manually. See [Lance Tokenizers](#lance-tokenizers). - -The `base_tokenizer` is responsible for tokenizing documents. It supports multiple types for different text processing -needs, see [Base Tokenizers](#base-tokenizers). 
- -### Lance Tokenizers -| Tokenizer | Description | Use Case | -|-----------|----------------------------------------------------------------------|-----------------| -| **text** | Parse TEXT document into tokens. | Text document | -| **json** | Parse JSON document into tokens in triplet format `path,type,value`. | Json document | - -#### Text Tokenizer -Text Tokenizer is responsible for handling TEXT-type data, which is Utf8, LargeUtf8 or List of them in arrow format. - -Text Tokenizer applies the same rules when tokenizing query statements and documents. This rule may not necessarily -apply to other document types. As document types become more complex, tokenizer might use different rules for query -tokenization and document tokenization, such as the JSON Tokenizer shown below. - -#### Json Tokenizer -The Json Tokenizer is responsible for handling JSON-type data, which is the JSON type in arrow format. -Unlike Text Tokenizer, the Json Tokenizer behaves differently in query and document parsing scenarios. - -JSON is a nested structure, a JSON document can always be converted into triplets in format: `path,type,value`. That's -how lance handle JSON during document parsing, breaking down JSON document into tokens in the triplet format. -In scenarios where the triplet value is a text, the text value will be further tokenized using the base_tokenizer, -resulting in multiple triplet tokens. - -During querying, the Json Tokenizer uses the triplet format instead of the JSON format, which simplifies the query -syntax. - -The example below shows how the Json Tokenizer works. 
Assume we have the following JSON document: -```json -{ - "name": "Lance", - "legal.age": 30, - "address": { - "city": "San Francisco", - "zip:us": 94102 - } -} -``` - -After parsing, the document will be tokenized into the following tokens: -``` -name,str,Lance -legal.age,number,30 -address.city,str,San -address.city,str,Francisco -address.zip:us,number,94102 -``` - -Then we do full text search in triplet format. To search for "San Francisco," we can search with one of the triplets -below: -``` -address.city:San Francisco -address.city:San -address.city:Francisco -``` - +The full text search index supports multiple tokenizer types for different text processing needs: ### Base Tokenizers @@ -190,6 +129,66 @@ Token filters are applied in sequence after the base tokenizer: For stemming and stop word removal, the following languages are supported: Arabic, Danish, Dutch, English, Finnish, French, German, Greek, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Tamil, Turkish +## Document Type +Lance supports two kinds of documents: text and json. Different document types have different tokenization rules, and +parse tokens in different formats. + +### Text Type +Text type includes text and list of text. Tokens are generated by base_tokenizer. + +The example below shows how a text document is parsed into tokens. +```text +Tom lives in San Francisco. +``` + +The tokens are below. +```text +Tom +lives +in +San +Francisco +``` + +### Json Type +Json is a nested structure, lance breaks down the json document into tokens in the triplet format `path,type,value`. The valid +types are: str, number, bool, null. + +In scenarios where the triplet value is a str, the text value will be further tokenized using the base_tokenizer, +resulting in multiple triplet tokens. + +During querying, the Json Tokenizer uses the triplet format instead of the json format, which simplifies the query +syntax. + +The example below shows how the json document is tokenized.
Assume we have the following json document: +```json +{ + "name": "Lance", + "legal.age": 30, + "address": { + "city": "San Francisco", + "zip:us": 94102 + } +} +``` + +After parsing, the document will be tokenized into the following tokens: +``` +name,str,Lance +legal.age,number,30 +address.city,str,San +address.city,str,Francisco +address.zip:us,number,94102 +``` + +Then we do full text search in triplet format. To search for "San Francisco," we can search with one of the triplets +below: +``` +address.city:San Francisco +address.city:San +address.city:Francisco +``` + ## Accelerated Queries Lance SDKs provide dedicated full text search APIs to leverage the FTS index capabilities.