From 28e1abd41cb6d1b00d0cad16552312cb8d2ce53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Karata=C5=9F?= Date: Mon, 23 Mar 2026 20:25:06 +0300 Subject: [PATCH] fix: add FLAG num support for agglutinative languages The spell checker's affix parser used `rune` (single character) as the map key for affix rules. This broke dictionaries that use `FLAG num` format, where flags are comma-separated numbers (e.g., "14308,10482"). Only the first digit of each numeric flag was read, causing most affix rules to be unreachable. This affected all agglutinative languages (Turkish, Hungarian, Finnish, etc.) whose Hunspell dictionaries use `FLAG num` with tens of thousands of suffix groups. Changes: - Change AffixMap key from `rune` to `string` - Change compoundMap key from `rune` to `string` - Add parseFlags() method that handles ASCII, num, long, and UTF-8 formats - Update expand() to use parsed flag slices instead of rune iteration - Update compound rule parsing in gospell.go For the Turkish dictionary (tr_TR), this enables correct recognition of ~59,000 suffix groups and ~15.8M inflected word forms that were previously unreachable. --- internal/spell/aff.go | 56 ++++++++--- internal/spell/aff_test.go | 194 +++++++++++++++++++++++++++++++++++++ internal/spell/gospell.go | 17 ++-- 3 files changed, 244 insertions(+), 23 deletions(-) create mode 100644 internal/spell/aff_test.go diff --git a/internal/spell/aff.go b/internal/spell/aff.go index 32b365384..5f72cc22a 100644 --- a/internal/spell/aff.go +++ b/internal/spell/aff.go @@ -62,13 +62,39 @@ type dictConfig struct { TryChars string WordChars string CompoundOnly string - AffixMap map[rune]affix + AffixMap map[string]affix CamelCase int CompoundMin int64 - compoundMap map[rune][]string + compoundMap map[string][]string NoSuggestFlag string } +// parseFlags splits a flag string into individual flags based on the FLAG type. +// +// Hunspell supports several flag formats: +// - "ASCII" (default): each character is a flag +// - "num": flags are comma-separated numbers (e.g., "14308,10482,4720") +// - "UTF-8": each UTF-8 character is a flag +// - "long": each pair of ASCII characters is a flag +func (a dictConfig) parseFlags(flagStr string) []string { + switch a.Flag { + case "num": + return strings.Split(flagStr, ",") + case "long": + flags := make([]string, 0, len(flagStr)/2) + for i := 0; i+1 < len(flagStr); i += 2 { + flags = append(flags, flagStr[i:i+2]) + } + return flags + default: // "ASCII" or "UTF-8" + flags := make([]string, 0, len(flagStr)) + for _, r := range flagStr { + flags = append(flags, string(r)) + } + return flags + } +} + // expand expands a word/affix using dictionary/affix rules // // This also supports CompoundRule flags @@ -87,11 +113,13 @@ func (a dictConfig) expand(wordAffix string, out []string) ([]string, error) { // safe word, keyString := wordAffix[:idx], wordAffix[idx+1:] + flags := a.parseFlags(keyString) + // check to see if any of the flags are in the // "compound only". If so then nothing to add compoundOnly := false - for _, key := range keyString { - if strings.ContainsRune(a.CompoundOnly, key) { + for _, key := range flags { + if key == a.CompoundOnly { compoundOnly = true continue } @@ -110,12 +138,9 @@ func (a dictConfig) expand(wordAffix string, out []string) ([]string, error) { out = append(out, word) prefixes := make([]affix, 0, 5) suffixes := make([]affix, 0, 5) - for _, key := range keyString { - // want keyString to []?something? - // then iterate over that + for _, key := range flags { af, ok := a.AffixMap[key] if !ok { - // TODO: How should we handle this? continue } if !af.CrossProduct { @@ -161,8 +186,8 @@ func isCrossProduct(val string) (bool, error) { func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen aff := dictConfig{ Flag: "ASCII", - AffixMap: make(map[rune]affix), - compoundMap: make(map[rune][]string), + AffixMap: make(map[string]affix), + compoundMap: make(map[string][]string), CompoundMin: 3, // default in Hunspell } scanner := bufio.NewScanner(file) @@ -219,9 +244,9 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen aff.CompoundRule = make([]string, 0, val) } else { aff.CompoundRule = append(aff.CompoundRule, parts[1]) - for _, char := range parts[1] { - if _, ok := aff.compoundMap[char]; !ok { - aff.compoundMap[char] = []string{} + for _, flag := range aff.parseFlags(parts[1]) { + if _, ok := aff.compoundMap[flag]; !ok { + aff.compoundMap[flag] = []string{} } } } @@ -248,8 +273,7 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen sections := len(parts) if sections > 4 { - // does this need to be split out into suffix and prefix? - flag := rune(parts[1][0]) + flag := parts[1] a, ok := aff.AffixMap[flag] if !ok { return nil, fmt.Errorf("got rules for flag %q but no definition", flag) @@ -299,7 +323,7 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen Type: atype, CrossProduct: cross, } - flag := rune(parts[1][0]) + flag := parts[1] aff.AffixMap[flag] = a } default: diff --git a/internal/spell/aff_test.go b/internal/spell/aff_test.go new file mode 100644 index 000000000..5bc1cfb66 --- /dev/null +++ b/internal/spell/aff_test.go @@ -0,0 +1,194 @@ +package spell + +import ( + "strings" + "testing" +) + +func TestParseFlagsASCII(t *testing.T) { + dc := dictConfig{Flag: "ASCII"} + flags := dc.parseFlags("ABC") + if len(flags) != 3 || flags[0] != "A" || flags[1] != "B" || flags[2] != "C" { + t.Errorf("ASCII parseFlags(%q) = %v, want [A B C]", "ABC", flags) + } +} + +func TestParseFlagsNum(t *testing.T) { + dc := dictConfig{Flag: "num"} + flags := dc.parseFlags("14308,10482,4720") + if len(flags) != 3 || flags[0] != "14308" || flags[1] != "10482" || flags[2] != "4720" { + t.Errorf("num parseFlags(%q) = %v, want [14308 10482 4720]", "14308,10482,4720", flags) + } +} + +func TestParseFlagsLong(t *testing.T) { + dc := dictConfig{Flag: "long"} + flags := dc.parseFlags("AABB") + if len(flags) != 2 || flags[0] != "AA" || flags[1] != "BB" { + t.Errorf("long parseFlags(%q) = %v, want [AA BB]", "AABB", flags) + } +} + +func TestParseFlagsUTF8(t *testing.T) { + dc := dictConfig{Flag: "UTF-8"} + flags := dc.parseFlags("AğB") + if len(flags) != 3 || flags[0] != "A" || flags[1] != "ğ" || flags[2] != "B" { + t.Errorf("UTF-8 parseFlags(%q) = %v, want [A ğ B]", "AğB", flags) + } +} + +func TestFlagNumAffixParsing(t *testing.T) { + // Minimal FLAG num AFF file + affContent := `SET UTF-8 +FLAG num + +SFX 100 N 1 +SFX 100 0 ler . + +SFX 200 N 1 +SFX 200 0 in . +` + aff, err := newDictConfig(strings.NewReader(affContent)) + if err != nil { + t.Fatalf("newDictConfig error: %v", err) + } + + if aff.Flag != "num" { + t.Errorf("Flag = %q, want %q", aff.Flag, "num") + } + + // Check that affix 100 exists with "ler" suffix + a100, ok := aff.AffixMap["100"] + if !ok { + t.Fatal("AffixMap missing flag 100") + } + if len(a100.Rules) != 1 || a100.Rules[0].AffixText != "ler" { + t.Errorf("flag 100 rules = %v, want [{ler}]", a100.Rules) + } + + // Check that affix 200 exists with "in" suffix + a200, ok := aff.AffixMap["200"] + if !ok { + t.Fatal("AffixMap missing flag 200") + } + if len(a200.Rules) != 1 || a200.Rules[0].AffixText != "in" { + t.Errorf("flag 200 rules = %v, want [{in}]", a200.Rules) + } +} + +func TestFlagNumExpand(t *testing.T) { + affContent := `SET UTF-8 +FLAG num + +SFX 100 N 1 +SFX 100 0 ler . + +SFX 200 N 1 +SFX 200 0 in . +` + aff, err := newDictConfig(strings.NewReader(affContent)) + if err != nil { + t.Fatalf("newDictConfig error: %v", err) + } + + // "belge/100,200" should expand to: belge, belgeler, belgein + words, err := aff.expand("belge/100,200", nil) + if err != nil { + t.Fatalf("expand error: %v", err) + } + + expected := map[string]bool{"belge": true, "belgeler": true, "belgein": true} + for _, w := range words { + if !expected[w] { + t.Errorf("unexpected word %q in expansion", w) + } + delete(expected, w) + } + for w := range expected { + t.Errorf("missing expected word %q", w) + } +} + +func TestFlagNumGoSpellReader(t *testing.T) { + affContent := `SET UTF-8 +FLAG num + +SFX 100 N 1 +SFX 100 0 ler . + +SFX 200 N 1 +SFX 200 0 nin . +` + dicContent := `2 +belge/100,200 +sistem/100,200 +` + + gs, err := newGoSpellReader( + strings.NewReader(affContent), + strings.NewReader(dicContent), + ) + if err != nil { + t.Fatalf("newGoSpellReader error: %v", err) + } + + tests := []struct { + word string + want bool + }{ + {"belge", true}, + {"belgeler", true}, + {"belgenin", true}, + {"sistem", true}, + {"sistemler", true}, + {"sistemnin", true}, + {"bilinmeyen", false}, + } + + for _, tt := range tests { + got := gs.spell(tt.word) + if got != tt.want { + t.Errorf("spell(%q) = %v, want %v", tt.word, got, tt.want) + } + } +} + +func TestASCIFlagBackwardCompatibility(t *testing.T) { + // Original ASCII flag format must still work + affContent := `SET UTF-8 + +SFX A N 1 +SFX A 0 s . + +SFX B N 1 +SFX B 0 ed . +` + dicContent := `1 +test/AB +` + + gs, err := newGoSpellReader( + strings.NewReader(affContent), + strings.NewReader(dicContent), + ) + if err != nil { + t.Fatalf("newGoSpellReader error: %v", err) + } + + tests := []struct { + word string + want bool + }{ + {"test", true}, + {"tests", true}, + {"tested", true}, + {"testing", false}, + } + + for _, tt := range tests { + got := gs.spell(tt.word) + if got != tt.want { + t.Errorf("spell(%q) = %v, want %v", tt.word, got, tt.want) + } + } +} diff --git a/internal/spell/gospell.go b/internal/spell/gospell.go index cdfec6cde..7eb8652f9 100644 --- a/internal/spell/gospell.go +++ b/internal/spell/gospell.go @@ -220,14 +220,17 @@ func newGoSpellReader(aff, dic io.Reader) (*goSpell, error) { for _, compoundRule := range affix.CompoundRule { pattern := "^" - for _, key := range compoundRule { - switch key { - case '(', ')', '+', '?', '*': - pattern += regexp.QuoteMeta(string(key)) - default: - groups := affix.compoundMap[key] - pattern = pattern + "(" + strings.Join(groups, "|") + ")" + for _, key := range affix.parseFlags(compoundRule) { + if len(key) == 1 { + r := rune(key[0]) + switch r { + case '(', ')', '+', '?', '*': + pattern += regexp.QuoteMeta(key) + continue + } } + groups := affix.compoundMap[key] + pattern = pattern + "(" + strings.Join(groups, "|") + ")" } pattern += "$"