Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions kiva/fonttools/text/_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# (C) Copyright 2005-2021 Enthought, Inc., Austin, TX
# All rights reserved.
#
# This software is provided without warranty under the terms of the BSD
# license included in LICENSE.txt and may be redistributed only under
# the conditions described in the aforementioned license. The license
# is also available online at http://www.enthought.com/licenses/BSD.txt
#
# Thanks for using Enthought open source!
import locale

from kiva.fonttools.text._data import SCRIPTS

# The font languages which we recognize.
# The names are derived from two tables in kiva.fonttools._util:
# `_ot_code_page_masks` and `_ot_unicode_range_bits`.
#
# NOTE(review): the entry "Georgia" would not match a script named "Georgian"
# (the usual Unicode script name) -- confirm against `SCRIPTS` and the
# tables in `_util` before relying on it.
_FONT_LANGUAGES = [
    # A - C
    "Arabic", "Armenian",
    "Balinese", "Bengali", "Buginese",
    "Canadian_Aboriginal", "Cherokee", "Coptic", "Cyrillic",
    # D - K
    "Deseret", "Devanagari",
    "Ethiopic",
    "Georgia", "Glagolitic", "Gothic", "Greek", "Gujarati", "Gurmukhi",
    "Hebrew",
    "Japanese",
    "Kannada", "Khmer", "Korean",
    # L - R
    "Lao", "Latin", "Limbu",
    "Malayalam", "Math", "Mongolian", "Myanmar",
    "New_Tai_Lue", "Nko",
    "Ogham", "Oriya",
    "Phoenician",
    "Runic",
    # S - V
    "Simplified Chinese", "Sinhala", "Symbol", "Syriac",
    "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh",
    "Traditional Chinese",
    "Vai", "Vietnamese",
]


def build_script_to_language_map():
    """ Create a dictionary which maps from script name (from `SCRIPTS`) to
    font language.

    NOTE: The language for a given script is locale dependent.

    Returns
    -------
    mapping : dict
        One entry per name in `SCRIPTS`. The value is, in order of
        preference: the explicitly known language for that script, the
        script name itself when it is one of `_FONT_LANGUAGES`, or
        "Latin" as the fallback.
    """
    # The locale is only a hint for the "Han" guess below, so a failed
    # lookup must not break the whole mapping:
    # * CPython raises ValueError when the environment names a locale it
    #   cannot parse.
    # * The language code is None when it cannot be determined, which
    #   includes the plain "C" locale.
    # NOTE: `locale.getdefaultlocale` is deprecated since Python 3.11.
    try:
        locale_lang = locale.getdefaultlocale()[0]
    except ValueError:
        locale_lang = None

    if locale_lang is None or locale_lang == "C":
        locale_lang = "en_US"

    # Pick a language to use for "Han" script
    #
    # NOTE (from review of this change): this locale-based choice should
    # only be used when the language is not otherwise clear from context.
    # If a string already contains Hiragana or Katakana, Han should map to
    # "Japanese"; if Hangul is encountered, Han should map to "Korean".
    # Only when Han is mixed with some non-CJK language should this
    # locale-based guess be the fallback. That refinement is not
    # implemented here.
    if locale_lang in ("zh_CN", "zh_SG"):
        han_lang = "Simplified Chinese"
    elif locale_lang.startswith("ja"):
        han_lang = "Japanese"
    elif locale_lang.startswith("ko"):
        han_lang = "Korean"
    else:
        han_lang = "Traditional Chinese"  # Default

    # Mapping from script -> language that we're _mostly_ sure about
    known_mappings = {
        # Special script properties
        "Common": "Common",
        "Inherited": "Inherited",
        "Unknown": "Unknown",

        # Scripts which infer the writing system
        "Bopomofo": "Traditional Chinese",  # XXX: Taiwan only?
        "Han": han_lang,
        "Hangul": "Korean",
        "Hiragana": "Japanese",
        "Katakana": "Japanese",
    }

    # Build the membership set once instead of scanning the list for
    # every script.
    font_languages = set(_FONT_LANGUAGES)

    mapping = {}
    for script in SCRIPTS:
        if script in known_mappings:
            mapping[script] = known_mappings[script]
        elif script in font_languages:
            # The script name doubles as the font language name.
            mapping[script] = script
        else:
            mapping[script] = "Latin"

    return mapping
16 changes: 11 additions & 5 deletions kiva/fonttools/text/_unicode_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np

from kiva.fonttools.text._data import ENTRIES
from kiva.fonttools.text._language import build_script_to_language_map


class UnicodeAnalyzer:
Expand All @@ -19,24 +20,25 @@ class UnicodeAnalyzer:
def __init__(self):
    """ Build the lookup tables used by the analyzer.

    Each item of `ENTRIES` is split in two: its first two fields go into
    the `ranges` array (int32) and the remaining fields are kept, in the
    same order, in the `values` list. `lang_map` holds the script name to
    font language mapping from `build_script_to_language_map`.
    """
    bounds = []
    payloads = []
    for entry in ENTRIES:
        bounds.append(entry[:2])
        payloads.append(entry[2:])

    self.ranges = np.array(bounds, dtype=np.int32)
    self.values = payloads
    self.lang_map = build_script_to_language_map()

def languages(self, text):
    """ Given a Unicode string, return the languages that it contains.

    Parameters
    ----------
    text : str
        The string to analyze.

    Returns
    -------
    result : list of tuple
        ``(start, end, language)`` runs in order of appearance, where
        ``end`` is exclusive and ``language`` is the entry of
        ``self.lang_map`` for the script of the run. An empty string
        gives an empty list.
    """
    # An empty string has no runs. Without this guard the loop below never
    # binds its index and the final append would fail.
    if not text:
        return []

    result = []
    last_lang = "Common"
    last_start = 0

    # XXX: Should this be normalized first?
    for idx, cp in enumerate(text):
        lang, _ = self._lookup_codepoint(cp)
        # "Inherited" and "Unknown" code points never open a new run; they
        # are absorbed into whichever run is current.
        if lang != last_lang and lang not in ("Inherited", "Unknown"):
            # Nothing to close when the very first code point switches
            # away from the initial "Common" state.
            if idx > 0:
                result.append((last_start, idx, self.lang_map[last_lang]))
            last_lang = lang
            last_start = idx

    # Close the run that extends to the end of the string.
    result.append((last_start, len(text), self.lang_map[last_lang]))

    return result

Expand All @@ -49,5 +51,9 @@ def _lookup_codepoint(self, cp):
# or negative and whose end is zero or positive. That should only be
# True in one location, so we get the index of that location.
comps = self.ranges - ord(cp)
index = ((comps[:, 0] <= 0) == (comps[:, 1] >= 0)).argmax()
below_and_above = ((comps[:, 0] <= 0) == (comps[:, 1] >= 0))
if not below_and_above.any():
return ("Unknown", "Zz")

index = below_and_above.argmax()
return self.values[index]
22 changes: 20 additions & 2 deletions kiva/fonttools/text/tests/test_unicode_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,29 @@ def test_sample_strings(self):
res = an.languages(st)
self.assertListEqual(res, [(0, len(st), "Common")])

st = "안녕하세요"
res = an.languages(st)
self.assertListEqual(res, [(0, len(st), "Korean")])

st = "こんにちは"
res = an.languages(st)
self.assertListEqual(res, [(0, len(st), "Japanese")])

def test_locale_dependent(self):
an = UnicodeAnalyzer()

# "Han" script is mapped to a language related to the default locale.
han_language = an.lang_map["Han"]

st = "你好世界"
res = an.languages(st)
self.assertListEqual(res, [(0, len(st), "Han")])
self.assertListEqual(res, [(0, len(st), han_language)])

st = "Kiva Graphics一番😎"
expected = [(0, 13, "Common"), (13, 15, "Han"), (15, 16, 'Common')]
expected = [
(0, 13, "Common"),
(13, 15, han_language),
(15, 16, 'Common'),
]
res = an.languages(st)
self.assertListEqual(res, expected)