Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions packages/gds-framework/gds/types/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,23 @@

from __future__ import annotations

import unicodedata


def tokenize(signature: str) -> frozenset[str]:
"""Tokenize a signature string into a normalized frozen set of tokens.

Splitting rules (applied in order):
1. Split on ' + ' (the compound-type joiner).
2. Split each part on ', ' (comma-space).
3. Strip whitespace and lowercase each token.
4. Discard empty strings.
1. Apply Unicode NFC normalization (so that e.g. é as base+combining
matches precomposed é).
2. Split on ' + ' (the compound-type joiner).
3. Split each part on ', ' (comma-space).
4. Strip whitespace and lowercase each token.
5. Discard empty strings.
"""
if not signature:
return frozenset()
signature = unicodedata.normalize("NFC", signature)
tokens: set[str] = set()
for plus_part in signature.split(" + "):
for comma_part in plus_part.split(", "):
Expand Down
29 changes: 29 additions & 0 deletions packages/gds-framework/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,35 @@ def test_whitespace_stripped(self):
def test_case_normalization(self):
    """Tokens are lowercased regardless of the input's casing."""
    result = tokenize("TEMPERATURE")
    assert result == frozenset(["temperature"])

def test_unicode_nfc_normalization(self):
    """NFC and NFD forms of the same character produce identical tokens."""
    import unicodedata

    composed = unicodedata.normalize("NFC", "Température")  # precomposed é
    decomposed = unicodedata.normalize("NFD", "Température")  # base e + combining accent
    # Sanity check: the two encodings really are distinct code-point sequences.
    assert composed != decomposed
    assert tokenize(composed) == tokenize(decomposed)

def test_unicode_accented_overlap(self):
    """Accented tokens match across NFC/NFD encodings in overlap checks."""
    import unicodedata

    composed = unicodedata.normalize("NFC", "Vélocité")
    decomposed = unicodedata.normalize("NFD", "Vélocité")
    assert tokens_overlap(composed, decomposed) is True

def test_unicode_accented_subset(self):
    """Accented tokens match across NFC/NFD encodings in subset checks."""
    import unicodedata

    # The single NFC token must be found inside the NFD compound signature.
    needle = unicodedata.normalize("NFC", "Résistance")
    haystack = unicodedata.normalize("NFD", "Résistance + Capacitance")
    assert tokens_subset(needle, haystack) is True

def test_unicode_plain_ascii_unaffected(self):
    """NFC normalization is a no-op for plain ASCII strings."""
    expected = frozenset({"temperature"})
    assert tokenize("Temperature") == expected


# ── tokens_subset() ─────────────────────────────────────────

Expand Down