enthought · jwiggins · Apr 6, 2021 · Mar 30, 2021 · jwiggins · Apr 6, 2021
diff --git a/kiva/fonttools/text/_language.py b/kiva/fonttools/text/_language.py
@@ -0,0 +1,74 @@
+# (C) Copyright 2005-2021 Enthought, Inc., Austin, TX
+# All rights reserved.
+#
+# This software is provided without warranty under the terms of the BSD
+# license included in LICENSE.txt and may be redistributed only under
+# the conditions described in the aforementioned license. The license
+# is also available online at http://www.enthought.com/licenses/BSD.txt
+#
+# Thanks for using Enthought open source!
+import locale
+
+from kiva.fonttools.text._data import SCRIPTS
+
+# Derived from kiva.fonttools._util:
+# `_ot_code_page_masks` and `_ot_unicode_range_bits`
+# Font languages we recognize. Entries equal to a `SCRIPTS` name map 1:1.
+_FONT_LANGUAGES = [
+    "Arabic", "Armenian", "Balinese", "Bengali", "Buginese",
+    "Canadian_Aboriginal", "Cherokee", "Coptic", "Cyrillic", "Deseret",
+    "Devanagari", "Ethiopic", "Georgian", "Glagolitic", "Gothic", "Greek",
+    "Gujarati", "Gurmukhi", "Hebrew", "Japanese", "Kannada", "Khmer", "Korean",
+    "Lao", "Latin", "Limbu", "Malayalam", "Math", "Mongolian", "Myanmar",
+    "New_Tai_Lue", "Nko", "Ogham", "Oriya", "Phoenician", "Runic",
+    "Simplified Chinese", "Sinhala", "Symbol", "Syriac", "Tai_Le", "Tamil",
+    "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Traditional Chinese",
+    "Vai", "Vietnamese",
+]
+
+
+def build_script_to_language_map():
+    """ Create a dictionary which maps from script name (from `SCRIPTS`) to
+    font language.
+
+    NOTE: The language chosen for the "Han" script is locale dependent.
+    """
+    locale_lang = locale.getdefaultlocale()[0]
+    # The language code is None when the locale cannot be determined.
+    if locale_lang is None or locale_lang == "C":
+        locale_lang = "en_US"
+
+    # Pick a language to use for "Han" script
+    han_lang = "Traditional Chinese"  # Default
+    if locale_lang in ("zh_CN", "zh_SG"):
+        han_lang = "Simplified Chinese"
+    elif locale_lang.startswith("ja"):
+        han_lang = "Japanese"
+    elif locale_lang.startswith("ko"):
+        han_lang = "Korean"
+
+    # Mapping from script -> language that we're _mostly_ sure about
+    known_mappings = {
+        # Special script properties
+        "Common": "Common",
+        "Inherited": "Inherited",
+        "Unknown": "Unknown",
+
+        # Scripts which infer the writing system
+        "Bopomofo": "Traditional Chinese",  # XXX: Taiwan only?
+        "Han": han_lang,
+        "Hangul": "Korean",
+        "Hiragana": "Japanese",
+        "Katakana": "Japanese",
+    }
+
+    mapping = {}
+    for script in SCRIPTS:
+        if script in known_mappings:
+            mapping[script] = known_mappings[script]
+        elif script in _FONT_LANGUAGES:
+            mapping[script] = script
+        else:
+            mapping[script] = "Latin"
+
+    return mapping
diff --git a/kiva/fonttools/text/_unicode_lookup.py b/kiva/fonttools/text/_unicode_lookup.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from kiva.fonttools.text._data import ENTRIES
+from kiva.fonttools.text._language import build_script_to_language_map
 
 
 class UnicodeAnalyzer:
@@ -19,24 +20,25 @@ class UnicodeAnalyzer:
     def __init__(self):
         self.ranges = np.array([e[:2] for e in ENTRIES], dtype=np.int32)
         self.values = [e[2:] for e in ENTRIES]
+        self.lang_map = build_script_to_language_map()
 
     def languages(self, text):
         """ Given a Unicode string, return the languages that it contains.
         """
         result = []
-        last_lang = ""
+        last_lang = "Common"
         last_start = 0
 
         # XXX: Should this be normalized first?
         for idx, cp in enumerate(text):
             lang, _ = self._lookup_codepoint(cp)
-            if lang != last_lang:
+            if lang != last_lang and lang not in ("Inherited", "Unknown"):
                 if idx > 0:
-                    result.append((last_start, idx, last_lang))
+                    result.append((last_start, idx, self.lang_map[last_lang]))
                 last_lang = lang
                 last_start = idx
 
-        result.append((last_start, idx + 1, last_lang))
+        result.append((last_start, idx + 1, self.lang_map[last_lang]))
 
         return result
 
@@ -49,5 +51,9 @@ def _lookup_codepoint(self, cp):
         # or negative and whose end is zero or positive. That should only be
         # True in one location, so we get the index of that location.
         comps = self.ranges - ord(cp)
-        index = ((comps[:, 0] <= 0) == (comps[:, 1] >= 0)).argmax()
+        below_and_above = ((comps[:, 0] <= 0) == (comps[:, 1] >= 0))
+        if not below_and_above.any():
+            return ("Unknown", "Zz")
+
+        index = below_and_above.argmax()
         return self.values[index]
diff --git a/kiva/fonttools/text/tests/test_unicode_lookup.py b/kiva/fonttools/text/tests/test_unicode_lookup.py
@@ -20,11 +20,29 @@ def test_sample_strings(self):
         res = an.languages(st)
         self.assertListEqual(res, [(0, len(st), "Common")])
 
+        st = "안녕하세요"
+        res = an.languages(st)
+        self.assertListEqual(res, [(0, len(st), "Korean")])
+
+        st = "こんにちは"
+        res = an.languages(st)
+        self.assertListEqual(res, [(0, len(st), "Japanese")])
+
+    def test_locale_dependent(self):
+        an = UnicodeAnalyzer()
+
+        # "Han" script is mapped to a language related to the default locale.
+        han_language = an.lang_map["Han"]
+
         st = "你好世界"
         res = an.languages(st)
-        self.assertListEqual(res, [(0, len(st), "Han")])
+        self.assertListEqual(res, [(0, len(st), han_language)])
 
         st = "Kiva Graphics一番😎"
-        expected = [(0, 13, "Common"), (13, 15, "Han"), (15, 16, 'Common')]
+        expected = [
+            (0, 13, "Common"),
+            (13, 15, han_language),
+            (15, 16, 'Common'),
+        ]
         res = an.languages(st)
         self.assertListEqual(res, expected)