From e61b0ce43b826fdd91e029090161ca2c054cd02e Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:40:14 +0200
Subject: [PATCH 1/3] docs: fix tokenize API guide; bump CI to 1.37.2

- TOKENIZE_API_USAGE.md: correct Tokenize.Text signature (adds stopwords
  param, fixes stopwordPresets type to
  IDictionary<string, IReadOnlyList<string>>), rewrite stopwords examples
  to use flat word lists, fix Property samples (DataType,
  PropertyTokenization, Use vs Get), drop fictional result fields
  (Tokenization / AnalyzerConfig / StopwordConfig) and fix the Result
  Shape table to match the actual record (Indexed + Query only).
- TestTokenize.Tokenization_Enum: pass stopwords: None explicitly so the
  test no longer depends on server defaults (1.37.2 auto-applies the EN
  preset for Word tokenization).
- CI: run on 1.37.2 instead of 1.37.1.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/main.yaml     |  2 +-
 docs/TOKENIZE_API_USAGE.md      | 77 +++++++++----------
 .../Integration/TestTokenize.cs |  4 +
 3 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 8e7e446e..3a5b5815 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -135,7 +135,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: ["1.32.27", "1.33.18", "1.34.20", "1.35.16", "1.36.10", "1.37.1"]
+        version: ["1.32.27", "1.33.18", "1.34.20", "1.35.16", "1.36.10", "1.37.2"]
     uses: ./.github/workflows/test-on-weaviate-version.yml
     secrets: inherit
     with:

diff --git a/docs/TOKENIZE_API_USAGE.md b/docs/TOKENIZE_API_USAGE.md
index 08bcb8a8..82257591 100644
--- a/docs/TOKENIZE_API_USAGE.md
+++ b/docs/TOKENIZE_API_USAGE.md
@@ -46,9 +46,9 @@ The `PropertyTokenization` enum covers all nine server-supported strategies:
 | `Field` | `" Hello World "` | `["Hello World"]` *(entire field, trimmed)* |
 | `Trigram` | `"Hello"` | `["hel", "ell", "llo"]` |
 | `Gse` | Chinese/Japanese | Requires `ENABLE_TOKENIZER_GSE=true` on the server |
-| `GseCh` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` |
-| `KagomeJa` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` |
-| `KagomeKr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` |
+| `Gse_ch` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` |
+| `Kagome_ja` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` |
+| `Kagome_kr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` |
 
 ## Ad-hoc Tokenization (`client.Tokenize.Text`)
 
 ```csharp
 Task<TokenizeResult> Tokenize.Text(
     string text,
     PropertyTokenization tokenization,
     TextAnalyzerConfig? analyzerConfig = null,
-    IDictionary<string, StopwordConfig>? stopwordPresets = null,
+    StopwordConfig? stopwords = null,
+    IDictionary<string, IReadOnlyList<string>>? stopwordPresets = null,
     CancellationToken cancellationToken = default
 );
 ```
+`stopwords` and `stopwordPresets` are mutually exclusive — passing both throws `ArgumentException`.
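+
+A minimal round-trip (a sketch; the token output assumes the server's default
+EN stopword preset for `Word` tokenization, described under Stopwords below):
+
+```csharp
+var result = await client.Tokenize.Text("the quick", PropertyTokenization.Word);
+
+// The default EN preset keeps stopwords in Indexed but drops them from Query:
+// result.Indexed → ["the", "quick"]
+// result.Query   → ["quick"]
+```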
+
 
 ## Property-scoped Tokenization (`collection.Tokenize.Property`)
 
 When you want to see how a specific property would tokenize text — using that property's configured tokenization — use the collection-scoped variant:
 
 ```csharp
-var collection = await client.Collections.Get("Article");
+var collection = client.Collections.Use("Article");
 
 var result = await collection.Tokenize.Property(
     propertyName: "title",
     text: " Hello World "
 );
 
-Console.WriteLine(result.Tokenization); // Field (whatever the property is configured with)
 Console.WriteLine(string.Join(", ", result.Indexed)); // Hello World
 ```
 
 ## Stopwords
 
-For more control, define a named preset via the `stopwordPresets` dictionary and reference it from `StopwordPreset`.
+There are two ways to feed stopwords into a tokenize call:
+
+1. **`stopwordPresets`** — a `name → word-list` dictionary. Each value is a flat list of stopwords for that preset. `TextAnalyzerConfig.StopwordPreset` then references one by name. A preset name that matches a built-in (`"en"`, `"none"`) replaces the built-in for this call.
+2. **`stopwords`** — a one-off `StopwordConfig` (`preset` + `additions` + `removals`) applied directly. Mirrors the collection-level `invertedIndexConfig.stopwords` shape.
+
+The two parameters are **mutually exclusive** — pass one or the other.
 
-### Add words to a preset
+### Custom named preset
 
 ```csharp
 var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" };
 
-var presets = new Dictionary<string, StopwordConfig>
+var presets = new Dictionary<string, IReadOnlyList<string>>
 {
-    ["custom"] = new StopwordConfig
-    {
-        Preset = StopwordConfig.Presets.None,
-        Additions = ["test"],
-    },
+    ["custom"] = new[] { "test" },
 };
 
 var result = await client.Tokenize.Text(
     "hello world test",
     PropertyTokenization.Word,
     analyzerConfig: cfg,
     stopwordPresets: presets
 );
 
 // result.Indexed → ["hello", "world", "test"]
 // result.Query   → ["hello", "world"] ("test" dropped)
 ```
 
-### Start from a base preset and remove words
+### One-off `stopwords` block
 
-```csharp
-var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" };
-
-var presets = new Dictionary<string, StopwordConfig>
-{
-    ["en-no-the"] = new StopwordConfig
-    {
-        Preset = StopwordConfig.Presets.EN,
-        Removals = ["the"],
-    },
-};
+Use `stopwords` when you want a base preset plus tweaks for a single call without defining a named preset:
 
+```csharp
 var result = await client.Tokenize.Text(
     "the quick",
     PropertyTokenization.Word,
-    analyzerConfig: cfg,
-    stopwordPresets: presets
+    stopwords: new StopwordConfig
+    {
+        Preset = StopwordConfig.Presets.EN,
+        Removals = ["the"],
+    }
 );
 
-// "the" is no longer a stopword in this preset, so it survives in both lists.
+// "the" was removed from the EN base, so it survives in both lists:
+// result.Indexed → ["the", "quick"]
+// result.Query   → ["the", "quick"]
 ```
 
 ### Combining folding and stopwords
 
 var result = await client.Tokenize.Text(
 
 ## Result Shape
 
-`TokenizeResult` is a sealed record:
+`TokenizeResult` is a sealed record with two members:
 
 | Member | Type | Description |
 |---|---|---|
-| `Tokenization` | `PropertyTokenization` | The method that was applied. |
 | `Indexed` | `ImmutableList<string>` | Tokens as stored in the inverted index. |
 | `Query` | `ImmutableList<string>` | Tokens used at query time (after stopword removal). |
-| `AnalyzerConfig` | `TextAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. |
-| `StopwordConfig` | `StopwordConfig?` | Echo of the resolved stopword config, or `null`. |
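+
+Both members are plain token lists, so results are easy to assert against. A
+sketch (token values assume the default EN stopword preset described below):
+
+```csharp
+var r = await client.Tokenize.Text("the quick", PropertyTokenization.Word);
+
+// Stopwords stay in Indexed but are dropped from Query:
+Debug.Assert(r.Indexed.SequenceEqual(new[] { "the", "quick" }));
+Debug.Assert(r.Query.SequenceEqual(new[] { "quick" }));
+```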
 
-The `AnalyzerConfig` echo is the server's view of what was applied — useful for verifying that your config was parsed correctly. The round-trip also normalizes wire-format quirks (the server represents `asciiFold` as a `bool` + separate `asciiFoldIgnore[]`, but the client unwraps it back into the nested `AsciiFoldConfig` record).
+The two lists differ when stopwords are configured: stopwords stay in `Indexed` (so BM25 can count document length) but are dropped from `Query` so they don't inflate match scores.
 
 ## Property-level Text Analyzer (schema)
 
 await client.Collections.Create(new CollectionCreateParams
 
         new Property
         {
             Name = "title",
-            DataType = [DataType.Text],
-            Tokenization = PropertyTokenization.Word,
+            DataType = DataType.Text,
+            PropertyTokenization = PropertyTokenization.Word,
             TextAnalyzer = new TextAnalyzerConfig
             {
                 AsciiFold = new AsciiFoldConfig(),
 
 await client.Collections.Create(new CollectionCreateParams
 
         new Property
         {
             Name = "body",
-            DataType = [DataType.Text],
+            DataType = DataType.Text,
+            PropertyTokenization = PropertyTokenization.Word,
             TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
         },
     ],
 
 var docTokens = (await collection.Tokenize.Property("body", "I was running")).
 
 ### Verifying analyzer config round-trip
 
-When you configure ASCII folding or a stopword preset, the server echoes back its interpretation on every call:
+Pass the analyzer config to `Tokenize.Text` and check the tokens it returns:
 
 ```csharp
 var cfg = new TextAnalyzerConfig
 {
     AsciiFold = new AsciiFoldConfig { Ignore = ["é"] },
     StopwordPreset = "en",
 };
 
 var result = await client.Tokenize.Text("L'école", PropertyTokenization.Word, analyzerConfig: cfg);
 
-Debug.Assert(result.AnalyzerConfig!.AsciiFold!.Ignore!.SequenceEqual(new[] { "é" }));
-Debug.Assert(result.AnalyzerConfig.StopwordPreset == "en");
+// AsciiFold is on, but "é" is in Ignore → "école" survives intact.
+Debug.Assert(result.Indexed.Contains("école"));
 ```

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index 28da5647..4c175f56 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -47,9 +47,13 @@ string[] expectedTokens
     {
         RequireVersion(nameof(TokenizeClient.Text));
 
+        // Disable stopwords explicitly: from 1.37.2 onward, Word tokenization
+        // defaults to the EN preset when no stopwords config is supplied, which
+        // would strip "the" from both lists and break the assertions below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,
+            stopwords: new StopwordConfig { Preset = StopwordConfig.Presets.None },
             cancellationToken: TestContext.Current.CancellationToken
         );
 

From 7d71102e3170b85d554b43d802d8c4cac7e625f1 Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:42:25 +0200
Subject: [PATCH 2/3] docs: drop unverified version claim in TestTokenize
 comment

The "from 1.37.2 onward" wording suggested the default-EN behavior was
introduced at that version; I haven't verified that against earlier
releases. The OpenAPI spec just documents it as the current default.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/Weaviate.Client.Tests/Integration/TestTokenize.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index 4c175f56..df0e531c 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -47,9 +47,9 @@ string[] expectedTokens
     {
         RequireVersion(nameof(TokenizeClient.Text));
 
-        // Disable stopwords explicitly: from 1.37.2 onward, Word tokenization
-        // defaults to the EN preset when no stopwords config is supplied, which
-        // would strip "the" from both lists and break the assertions below.
+        // Disable stopwords explicitly: when no stopwords config is supplied,
+        // the server defaults Word tokenization to the EN preset, which strips
+        // "the" from both lists and would break the assertions below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,

From 7ef9aa67b9d856177dc4028448fb5bfc11997beb Mon Sep 17 00:00:00 2001
From: Ivan Despot <66276597+g-despot@users.noreply.github.com>
Date: Thu, 30 Apr 2026 17:45:01 +0200
Subject: [PATCH 3/3] docs: correct TestTokenize comment (Query, not Indexed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verified locally against 1.37.2 with both dotnet test and a raw curl to
/v1/tokenize: the EN-preset default strips "the" from result.Query only —
result.Indexed always keeps it. The earlier comment said both lists were
affected, which was wrong.
---
 src/Weaviate.Client.Tests/Integration/TestTokenize.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
index df0e531c..d2c5d44f 100644
--- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -49,7 +49,7 @@ string[] expectedTokens
 
         // Disable stopwords explicitly: when no stopwords config is supplied,
         // the server defaults Word tokenization to the EN preset, which strips
-        // "the" from both lists and would break the assertions below.
+        // "the" from result.Query and would break the assertion below.
         var result = await _weaviate.Tokenize.Text(
             text,
             tokenization,