From 54304b4c5077d7881723a425c010fbed9d4eb40f Mon Sep 17 00:00:00 2001 From: rohan Date: Wed, 4 Mar 2026 15:33:01 +0530 Subject: [PATCH] feat: normalize Unicode in token system for consistent port matching Apply NFC normalization before lowercasing in tokenize() so that equivalent Unicode representations produce identical tokens. Closes #96 --- packages/gds-framework/gds/types/tokens.py | 13 +++++++--- packages/gds-framework/tests/test_types.py | 29 ++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/packages/gds-framework/gds/types/tokens.py b/packages/gds-framework/gds/types/tokens.py index 82f438d..52660e5 100644 --- a/packages/gds-framework/gds/types/tokens.py +++ b/packages/gds-framework/gds/types/tokens.py @@ -7,18 +7,23 @@ from __future__ import annotations +import unicodedata + def tokenize(signature: str) -> frozenset[str]: """Tokenize a signature string into a normalized frozen set of tokens. Splitting rules (applied in order): - 1. Split on ' + ' (the compound-type joiner). - 2. Split each part on ', ' (comma-space). - 3. Strip whitespace and lowercase each token. - 4. Discard empty strings. + 1. Apply Unicode NFC normalization (so that e.g. é as base+combining + matches precomposed é). + 2. Split on ' + ' (the compound-type joiner). + 3. Split each part on ', ' (comma-space). + 4. Strip whitespace and lowercase each token. + 5. Discard empty strings. """ if not signature: return frozenset() + signature = unicodedata.normalize("NFC", signature) tokens: set[str] = set() for plus_part in signature.split(" + "): for comma_part in plus_part.split(", "): diff --git a/packages/gds-framework/tests/test_types.py b/packages/gds-framework/tests/test_types.py index 8e914a2..9bcd667 100644 --- a/packages/gds-framework/tests/test_types.py +++ b/packages/gds-framework/tests/test_types.py @@ -45,6 +45,35 @@ def test_whitespace_stripped(self): def test_case_normalization(self): assert tokenize("TEMPERATURE") == frozenset({"temperature"}) + def test_unicode_nfc_normalization(self): + """NFC and NFD forms of the same character produce identical tokens.""" + import unicodedata + + nfc = unicodedata.normalize("NFC", "Température") # precomposed é + nfd = unicodedata.normalize("NFD", "Température") # base e + combining accent + assert nfc != nfd # different byte sequences + assert tokenize(nfc) == tokenize(nfd) + + def test_unicode_accented_overlap(self): + """Accented tokens match across NFC/NFD encodings in overlap checks.""" + import unicodedata + + nfc = unicodedata.normalize("NFC", "Vélocité") + nfd = unicodedata.normalize("NFD", "Vélocité") + assert tokens_overlap(nfc, nfd) is True + + def test_unicode_accented_subset(self): + """Accented tokens match across NFC/NFD encodings in subset checks.""" + import unicodedata + + nfc = unicodedata.normalize("NFC", "Résistance") + nfd = unicodedata.normalize("NFD", "Résistance + Capacitance") + assert tokens_subset(nfc, nfd) is True + + def test_unicode_plain_ascii_unaffected(self): + """NFC normalization is a no-op for plain ASCII strings.""" + assert tokenize("Temperature") == frozenset({"temperature"}) + # ── tokens_subset() ─────────────────────────────────────────