From 54304b4c5077d7881723a425c010fbed9d4eb40f Mon Sep 17 00:00:00 2001
From: rohan <e4rohan@gmail.com>
Date: Wed, 4 Mar 2026 15:33:01 +0530
Subject: [PATCH] feat: normalize Unicode in token system for consistent port
 matching

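Apply NFC normalization to the whole signature in tokenize(), before
splitting and lowercasing, so that canonically equivalent Unicode
representations (e.g. precomposed é vs. e + combining acute accent)
produce identical tokens.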

Closes #96
---
 packages/gds-framework/gds/types/tokens.py | 13 +++++++---
 packages/gds-framework/tests/test_types.py | 29 ++++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/packages/gds-framework/gds/types/tokens.py b/packages/gds-framework/gds/types/tokens.py
index 82f438d..52660e5 100644
--- a/packages/gds-framework/gds/types/tokens.py
+++ b/packages/gds-framework/gds/types/tokens.py
@@ -7,18 +7,23 @@
 
 from __future__ import annotations
 
+import unicodedata
+
 
 def tokenize(signature: str) -> frozenset[str]:
     """Tokenize a signature string into a normalized frozen set of tokens.
 
     Splitting rules (applied in order):
-    1. Split on ' + ' (the compound-type joiner).
-    2. Split each part on ', ' (comma-space).
-    3. Strip whitespace and lowercase each token.
-    4. Discard empty strings.
+    1. Apply Unicode NFC normalization (so that e.g. é as base+combining
+       matches precomposed é).
+    2. Split on ' + ' (the compound-type joiner).
+    3. Split each part on ', ' (comma-space).
+    4. Strip whitespace and lowercase each token.
+    5. Discard empty strings.
     """
     if not signature:
         return frozenset()
+    signature = unicodedata.normalize("NFC", signature)
     tokens: set[str] = set()
     for plus_part in signature.split(" + "):
         for comma_part in plus_part.split(", "):
diff --git a/packages/gds-framework/tests/test_types.py b/packages/gds-framework/tests/test_types.py
index 8e914a2..9bcd667 100644
--- a/packages/gds-framework/tests/test_types.py
+++ b/packages/gds-framework/tests/test_types.py
@@ -45,6 +45,35 @@ def test_whitespace_stripped(self):
     def test_case_normalization(self):
         assert tokenize("TEMPERATURE") == frozenset({"temperature"})
 
+    def test_unicode_nfc_normalization(self):
+        """NFC and NFD forms of the same character produce identical tokens."""
+        import unicodedata
+
+        nfc = unicodedata.normalize("NFC", "Température")  # precomposed é
+        nfd = unicodedata.normalize("NFD", "Température")  # base e + combining accent
+        assert nfc != nfd  # different code point sequences
+        assert tokenize(nfc) == tokenize(nfd)
+
+    def test_unicode_accented_overlap(self):
+        """Accented tokens match across NFC/NFD encodings in overlap checks."""
+        import unicodedata
+
+        nfc = unicodedata.normalize("NFC", "Vélocité")
+        nfd = unicodedata.normalize("NFD", "Vélocité")
+        assert tokens_overlap(nfc, nfd) is True
+
+    def test_unicode_accented_subset(self):
+        """Accented tokens match across NFC/NFD encodings in subset checks."""
+        import unicodedata
+
+        nfc = unicodedata.normalize("NFC", "Résistance")
+        nfd = unicodedata.normalize("NFD", "Résistance + Capacitance")
+        assert tokens_subset(nfc, nfd) is True
+
+    def test_unicode_plain_ascii_unaffected(self):
+        """NFC normalization is a no-op for plain ASCII strings."""
+        assert tokenize("Temperature") == frozenset({"temperature"})
+
 
 # ── tokens_subset() ─────────────────────────────────────────