Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
61b0822
feat: add 10 new languages (1.67B speakers) with grapheme mode and wo…
Hugo0 Mar 14, 2026
6984e29
feat: add word history freeze safety (Layer 1 + Layer 2)
Hugo0 Mar 14, 2026
d805d54
fix: add global profanity blocklist for daily word selection
Hugo0 Mar 14, 2026
1b14881
fix: address all CodeRabbit review findings
Hugo0 Mar 14, 2026
b8ccb29
feat: decontaminate daily words for 25+ existing languages
Hugo0 Mar 14, 2026
d28bee8
feat: add Hausa, Punjabi; boost Yoruba; decontaminate 25+ existing la…
Hugo0 Mar 14, 2026
d4d0a21
fix: revert decontamination for languages with small native dictionaries
Hugo0 Mar 14, 2026
96f45f5
feat: add Leipzig-based frequency ranking for 7 more languages
Hugo0 Mar 14, 2026
dae4a9c
feat: add frequency-ranked daily words for Bengali and Korean
Hugo0 Mar 14, 2026
8a7250c
refactor: rename curated_schedule → word_history, document legacy day…
Hugo0 Mar 14, 2026
457b0cb
feat: expand supplements with Leipzig (+436K words), re-decontaminate…
Hugo0 Mar 14, 2026
dc0a4a9
feat: expand supplements for grapheme languages (+59K words)
Hugo0 Mar 14, 2026
6c51c61
fix: remove 19 blocklisted compound jongseong words from Korean daily…
Hugo0 Mar 14, 2026
6e6fd3a
fix: revert Finnish daily_words, remove 454 garbage words from supple…
Hugo0 Mar 14, 2026
8ea23dc
feat: add Japanese (日本語) — 5-hiragana Wordle with 5K words
Hugo0 Mar 14, 2026
e019eff
fix: remove internet/brand terms from daily words (gmail, https, emai…
Hugo0 Mar 14, 2026
419783f
fix: remove 193 brand names, politicians, tech terms from daily words
Hugo0 Mar 14, 2026
df2ca04
feat: add dictionary verification gate for daily word selection
Hugo0 Mar 14, 2026
1a7599e
feat: add frequency-ranked Japanese daily words (1,646 words)
Hugo0 Mar 14, 2026
c9106c4
fix: rebuild Japanese word lists with strict quality filtering
Hugo0 Mar 14, 2026
462541a
refactor: unify word pipeline — one code path for all languages
Hugo0 Mar 14, 2026
ef5ad5b
docs: add YAML migration architecture plan
Hugo0 Mar 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@ webapp/static/word-history/

# Generated share images (rebuilt on deploy)
webapp/static/images/share/
scripts/.freq_data
411 changes: 411 additions & 0 deletions docs/WORD_DATA_ARCHITECTURE.md

Large diffs are not rendered by default.

19 changes: 12 additions & 7 deletions frontend/src/game.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { haptic, setHapticsEnabled } from './haptics';
import { sound, setSoundEnabled } from './sounds';
import { buildNormalizeMap, buildNormalizedWordMap, normalizeWord } from './diacritics';
import { buildFinalFormReverseMap, toFinalForm, toRegularForm } from './positional';
import { splitWord } from './graphemes';
import analytics from './analytics';
import { identifyUser, updateUserProperties } from './posthog';
import { calculateCommunityPercentile } from './stats';
Expand Down Expand Up @@ -623,10 +624,14 @@ export const createGameApp = () => {
return fullNormalize(c1) === fullNormalize(c2);
};

// Split target word into characters (grapheme clusters when grapheme_mode is enabled, codepoints otherwise)
const graphemeMode = this.config?.grapheme_mode === 'true';
const targetChars = splitWord(targetWord, graphemeMode);

// Count characters in target word using FULLY NORMALIZED forms
// This ensures "ä" and "a" are counted together, and "כ" and "ך" are counted together
const charCounts: Record<string, number> = {};
for (const char of targetWord) {
for (const char of targetChars) {
const normalizedChar = fullNormalize(char);
charCounts[normalizedChar] = (charCounts[normalizedChar] || 0) + 1;
}
Expand All @@ -641,7 +646,7 @@ export const createGameApp = () => {
// First pass: mark correct positions (using normalized comparison)
for (let i = 0; i < row.length; i++) {
const guessChar = row[i];
const targetChar = targetWord[i];
const targetChar = targetChars[i];
if (guessChar && targetChar && fullCharsMatch(guessChar, targetChar)) {
// Use splice for Vue 3 reactivity
classes.splice(i, 1, `correct ${baseClass}`);
Expand All @@ -666,9 +671,7 @@ export const createGameApp = () => {
const count = charCounts[normalizedGuess];

// Check if this normalized character exists in target (also normalized)
const targetHasChar = [...targetWord].some((tc) =>
fullCharsMatch(guessChar, tc)
);
const targetHasChar = targetChars.some((tc) => fullCharsMatch(guessChar, tc));

if (targetHasChar && count !== undefined && count > 0) {
// Use splice for Vue 3 reactivity
Expand Down Expand Up @@ -828,8 +831,10 @@ export const createGameApp = () => {
// Update tiles to show canonical form (with diacritics)
// This displays the correct accented letters after submission
if (row && canonicalWord !== typedWord) {
for (let i = 0; i < canonicalWord.length; i++) {
row.splice(i, 1, canonicalWord[i]);
const graphemeMode = this.config?.grapheme_mode === 'true';
const canonicalChars = splitWord(canonicalWord, graphemeMode);
for (let i = 0; i < canonicalChars.length; i++) {
row.splice(i, 1, canonicalChars[i]);
}
}

Expand Down
18 changes: 18 additions & 0 deletions frontend/src/graphemes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
 * Grapheme cluster utilities for languages where one visual character
 * spans multiple Unicode codepoints (e.g., Hindi/Devanagari aksharas).
 *
 * Uses Intl.Segmenter (an ECMA-402 API) when the runtime provides it.
 * Support arrived late in some browsers (notably Firefox, which only added
 * it in version 125), so the segmenter is created lazily on first use and a
 * regex-based approximation is used when the API is missing. Constructing it
 * at module load would throw on such browsers and break every language that
 * imports this module, not just the grapheme-mode ones.
 */

/** Cached segmenter instance; stays undefined until a grapheme-mode split succeeds in creating one. */
let cachedSegmenter: Intl.Segmenter | undefined;

/**
 * Fallback cluster pattern: one non-mark codepoint followed by any combining
 * marks (Unicode category M), or a leading run of marks with no base.
 * This is only an approximation of grapheme clusters (it ignores e.g. ZWJ
 * sequences), but it keeps vowel signs and other marks attached to their base.
 */
const MARK_CLUSTER_PATTERN = /\P{M}\p{M}*|\p{M}+/gu;

/**
 * Return the shared grapheme segmenter, creating it on first use.
 * Returns undefined when the runtime has no Intl.Segmenter.
 */
function getSegmenter(): Intl.Segmenter | undefined {
  if (
    cachedSegmenter === undefined &&
    typeof Intl !== 'undefined' &&
    typeof Intl.Segmenter === 'function'
  ) {
    cachedSegmenter = new Intl.Segmenter(undefined, { granularity: 'grapheme' });
  }
  return cachedSegmenter;
}

/**
 * Split a word into characters, respecting grapheme mode.
 *
 * @param word - The word to split.
 * @param graphemeMode - When true, returns grapheme clusters; otherwise
 *   returns individual codepoints (default behavior).
 * @returns The characters of `word` in order; an empty array for an empty word.
 */
export function splitWord(word: string, graphemeMode: boolean): string[] {
  if (!graphemeMode) return [...word];

  const segmenter = getSegmenter();
  if (segmenter === undefined) {
    // No Intl.Segmenter: approximate clusters instead of throwing.
    // match() returns null for an empty string, hence the `?? []`.
    return word.match(MARK_CLUSTER_PATTERN) ?? [];
  }
  return Array.from(segmenter.segment(word), (s) => s.segment);
}
5 changes: 5 additions & 0 deletions frontend/src/types/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ export interface LanguageConfig {
* Example for Korean: { "KeyQ": "ㅂ", "ShiftKeyQ": "ㅃ", ... }
*/
physical_key_map?: Record<string, string>;
/** When "true", word length is counted by grapheme clusters instead of codepoints.
* Required for scripts like Devanagari where one visual character (akshar)
* spans multiple Unicode codepoints.
*/
grapheme_mode?: 'true' | 'false';
}

// =============================================================================
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dependencies = [
"arabic-reshaper>=3.0.0",
"flask>=3.1.0",
"flask-cors>=6.0.0",
"grapheme>=0.6.0",
"gunicorn>=24.0.0",
"openai>=2.21.0",
"pillow>=12.1.1",
Expand Down
Loading
Loading