From e61b0ce43b826fdd91e029090161ca2c054cd02e Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:40:14 +0200
Subject: [PATCH 1/3] docs: fix tokenize API guide; bump CI to 1.37.2

- TOKENIZE_API_USAGE.md: correct Tokenize.Text signature (adds stopwords
  param, fixes stopwordPresets type to
  IDictionary<string, IReadOnlyList<string>>), rewrite stopwords examples
  to use flat word lists, fix Property samples (DataType,
  PropertyTokenization, Use vs Get), drop fictional result fields
  (Tokenization / AnalyzerConfig / StopwordConfig) and fix the Result
  Shape table to match the actual record (Indexed + Query only).
- TestTokenize.Tokenization_Enum: pass stopwords: None explicitly so the
  test no longer depends on server defaults (1.37.2 auto-applies the EN
  preset for Word tokenization).
- CI: run on 1.37.2 instead of 1.37.1.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/main.yaml     |  2 +-
 docs/TOKENIZE_API_USAGE.md      | 77 +++++++++----------
 .../Integration/TestTokenize.cs |  4 +
 3 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 8e7e446e..3a5b5815 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -135,7 +135,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: ["1.32.27", "1.33.18", "1.34.20", "1.35.16", "1.36.10", "1.37.1"]
+        version: ["1.32.27", "1.33.18", "1.34.20", "1.35.16", "1.36.10", "1.37.2"]
     uses: ./.github/workflows/test-on-weaviate-version.yml
     secrets: inherit
     with:

diff --git a/docs/TOKENIZE_API_USAGE.md b/docs/TOKENIZE_API_USAGE.md
index 08bcb8a8..82257591 100644
--- a/docs/TOKENIZE_API_USAGE.md
+++ b/docs/TOKENIZE_API_USAGE.md
@@ -46,9 +46,9 @@ The `PropertyTokenization` enum covers all nine server-supported strategies:
 | `Field` | `" Hello World "` | `["Hello World"]` *(entire field, trimmed)* |
 | `Trigram` | `"Hello"` | `["hel", "ell", "llo"]` |
 | `Gse` | Chinese/Japanese | Requires `ENABLE_TOKENIZER_GSE=true` on the server |
-| `GseCh` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` |
-| `KagomeJa` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` |
-| `KagomeKr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` |
+| `Gse_ch` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` |
+| `Kagome_ja` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` |
+| `Kagome_kr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` |
 
 ## Ad-hoc Tokenization (`client.Tokenize.Text`)
 
 ```csharp
 Task<TokenizeResult> Tokenize.Text(
     string text,
     PropertyTokenization tokenization,
     TextAnalyzerConfig? analyzerConfig = null,
-    IDictionary<string, StopwordConfig>? stopwordPresets = null,
+    StopwordConfig? stopwords = null,
+    IDictionary<string, IReadOnlyList<string>>? stopwordPresets = null,
     CancellationToken cancellationToken = default
 );
 ```
+`stopwords` and `stopwordPresets` are mutually exclusive — passing both throws `ArgumentException`.
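+
+A minimal round-trip (a sketch; the token output assumes the server's default
+EN stopword preset for `Word` tokenization, described under Stopwords below):
+
+```csharp
+var result = await client.Tokenize.Text("the quick", PropertyTokenization.Word);
+
+// The default EN preset keeps stopwords in Indexed but drops them from Query:
+// result.Indexed → ["the", "quick"]
+// result.Query   → ["quick"]
+```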
+
 
 ## Property-scoped Tokenization (`collection.Tokenize.Property`)
 
 When you want to see how a specific property would tokenize text — using that property's configured tokenization — use the collection-scoped variant:
 
 ```csharp
-var collection = await client.Collections.Get("Article");
+var collection = client.Collections.Use("Article");
 
 var result = await collection.Tokenize.Property(
     propertyName: "title",
     text: " Hello World "
 );
 
-Console.WriteLine(result.Tokenization); // Field (whatever the property is configured with)
 Console.WriteLine(string.Join(", ", result.Indexed)); // Hello World
 ```
 
 ## Stopwords
 
-For more control, define a named preset via the `stopwordPresets` dictionary and reference it from `StopwordPreset`.
+There are two ways to feed stopwords into a tokenize call:
+
+1. **`stopwordPresets`** — a `name → word-list` dictionary. Each value is a flat list of stopwords for that preset. `TextAnalyzerConfig.StopwordPreset` then references one by name. A preset name that matches a built-in (`"en"`, `"none"`) replaces the built-in for this call.
+2. **`stopwords`** — a one-off `StopwordConfig` (`preset` + `additions` + `removals`) applied directly. Mirrors the collection-level `invertedIndexConfig.stopwords` shape.
+
+The two parameters are **mutually exclusive** — pass one or the other.
 
-### Add words to a preset
+### Custom named preset
 
 ```csharp
 var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" };
 
-var presets = new Dictionary<string, StopwordConfig>
+var presets = new Dictionary<string, IReadOnlyList<string>>
 {
-    ["custom"] = new StopwordConfig
-    {
-        Preset = StopwordConfig.Presets.None,
-        Additions = ["test"],
-    },
+    ["custom"] = new[] { "test" },
 };
 
 var result = await client.Tokenize.Text(
     "hello world test",
     PropertyTokenization.Word,
     analyzerConfig: cfg,
     stopwordPresets: presets
 );
 
 // result.Indexed → ["hello", "world", "test"]
 // result.Query   → ["hello", "world"] ("test" dropped)
 ```
 
-### Start from a base preset and remove words
+### One-off `stopwords` block
 
-```csharp
-var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" };
-
-var presets = new Dictionary<string, StopwordConfig>
-{
-    ["en-no-the"] = new StopwordConfig
-    {
-        Preset = StopwordConfig.Presets.EN,
-        Removals = ["the"],
-    },
-};
+Use `stopwords` when you want a base preset plus tweaks for a single call without defining a named preset:
 
+```csharp
 var result = await client.Tokenize.Text(
     "the quick",
     PropertyTokenization.Word,
-    analyzerConfig: cfg,
-    stopwordPresets: presets
+    stopwords: new StopwordConfig
+    {
+        Preset = StopwordConfig.Presets.EN,
+        Removals = ["the"],
+    }
 );
 
-// "the" is no longer a stopword in this preset, so it survives in both lists.
+// "the" was removed from the EN base, so it survives in both lists:
+// result.Indexed → ["the", "quick"]
+// result.Query   → ["the", "quick"]
 ```
 
 ### Combining folding and stopwords
 
 var result = await client.Tokenize.Text(
 
 ## Result Shape
 
-`TokenizeResult` is a sealed record:
+`TokenizeResult` is a sealed record with two members:
 
 | Member | Type | Description |
 |---|---|---|
-| `Tokenization` | `PropertyTokenization` | The method that was applied. |
 | `Indexed` | `ImmutableList<string>` | Tokens as stored in the inverted index. |
 | `Query` | `ImmutableList<string>` | Tokens used at query time (after stopword removal). |
-| `AnalyzerConfig` | `TextAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. |
-| `StopwordConfig` | `StopwordConfig?` | Echo of the resolved stopword config, or `null`. |
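+
+Both members are plain token lists, so results are easy to assert against. A
+sketch (token values assume the default EN stopword preset described below):
+
+```csharp
+var r = await client.Tokenize.Text("the quick", PropertyTokenization.Word);
+
+// Stopwords stay in Indexed but are dropped from Query:
+Debug.Assert(r.Indexed.SequenceEqual(new[] { "the", "quick" }));
+Debug.Assert(r.Query.SequenceEqual(new[] { "quick" }));
+```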
 
-The `AnalyzerConfig` echo is the server's view of what was applied — useful for verifying that your config was parsed correctly. The round-trip also normalizes wire-format quirks (the server represents `asciiFold` as a `bool` + separate `asciiFoldIgnore[]`, but the client unwraps it back into the nested `AsciiFoldConfig` record).
+The two lists differ when stopwords are configured: stopwords stay in `Indexed` (so BM25 can count document length) but are dropped from `Query` so they don't inflate match scores.
 
 ## Property-level Text Analyzer (schema)
 
 await client.Collections.Create(new CollectionCreateParams
 
         new Property
         {
             Name = "title",
-            DataType = [DataType.Text],
-            Tokenization = PropertyTokenization.Word,
+            DataType = DataType.Text,
+            PropertyTokenization = PropertyTokenization.Word,
             TextAnalyzer = new TextAnalyzerConfig
             {
                 AsciiFold = new AsciiFoldConfig(),
 
 await client.Collections.Create(new CollectionCreateParams
 
         new Property
         {
             Name = "body",
-            DataType = [DataType.Text],
+            DataType = DataType.Text,
+            PropertyTokenization = PropertyTokenization.Word,
             TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
         },
     ],
 
 var docTokens = (await collection.Tokenize.Property("body", "I was running")).
 
 ### Verifying analyzer config round-trip
 
-When you configure ASCII folding or a stopword preset, the server echoes back its interpretation on every call:
+Pass the analyzer config to `Tokenize.Text` and check the tokens it returns:
 
 ```csharp
 var cfg = new TextAnalyzerConfig
 {
     AsciiFold = new AsciiFoldConfig { Ignore = ["é"] },
     StopwordPreset = "en",
 };
 
 var result = await client.Tokenize.Text("L'école", PropertyTokenization.Word, analyzerConfig: cfg);
 
-Debug.Assert(result.AnalyzerConfig!.AsciiFold!.Ignore!.SequenceEqual(new[] { "é" }));
-Debug.Assert(result.AnalyzerConfig.StopwordPreset == "en");
+// AsciiFold is on, but "é" is in Ignore → "école" survives intact.
+Debug.Assert(result.Indexed.Contains("école"));
 ```

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index 28da5647..4c175f56 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -47,9 +47,13 @@ string[] expectedTokens
     {
         RequireVersion(nameof(TokenizeClient.Text));
 
+        // Disable stopwords explicitly: from 1.37.2 onward, Word tokenization
+        // defaults to the EN preset when no stopwords config is supplied, which
+        // would strip "the" from both lists and break the assertions below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,
+            stopwords: new StopwordConfig { Preset = StopwordConfig.Presets.None },
             cancellationToken: TestContext.Current.CancellationToken
         );
 

From 7d71102e3170b85d554b43d802d8c4cac7e625f1 Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:42:25 +0200
Subject: [PATCH 2/3] docs: drop unverified version claim in TestTokenize
 comment

The "from 1.37.2 onward" wording suggested the default-EN behavior was
introduced at that version; I haven't verified that against earlier
releases. The OpenAPI spec just documents it as the current default.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/Weaviate.Client.Tests/Integration/TestTokenize.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index 4c175f56..df0e531c 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -47,9 +47,9 @@ string[] expectedTokens
     {
         RequireVersion(nameof(TokenizeClient.Text));
 
-        // Disable stopwords explicitly: from 1.37.2 onward, Word tokenization
-        // defaults to the EN preset when no stopwords config is supplied, which
-        // would strip "the" from both lists and break the assertions below.
+        // Disable stopwords explicitly: when no stopwords config is supplied,
+        // the server defaults Word tokenization to the EN preset, which strips
+        // "the" from both lists and would break the assertions below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,

From 7ef9aa67b9d856177dc4028448fb5bfc11997beb Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:45:01 +0200
Subject: [PATCH 3/3] docs: correct TestTokenize comment (Query, not Indexed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verified locally against 1.37.2 with both dotnet test and a raw curl to
/v1/tokenize: the EN-preset default strips "the" from result.Query only —
result.Indexed always keeps it. The earlier comment said both lists were
affected, which was wrong.
---
 src/Weaviate.Client.Tests/Integration/TestTokenize.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index df0e531c..d2c5d44f 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -49,7 +49,7 @@ string[] expectedTokens
 
         // Disable stopwords explicitly: when no stopwords config is supplied,
         // the server defaults Word tokenization to the EN preset, which strips
-        // "the" from both lists and would break the assertions below.
+        // "the" from result.Query and would break the assertion below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,