From 3722787486d469293fb219d000a9ccdd3762eda9 Mon Sep 17 00:00:00 2001 From: Baptiste Lagarde Date: Sun, 2 Feb 2014 10:53:44 +1100 Subject: [PATCH 1/2] Major refactor. This should make it easier to maintain. --- setup.py | 6 +- src/romkan/common.py | 458 ++--------------------------------------- src/romkan/compat.py | 14 ++ src/romkan/data.py | 67 ++++++ src/romkan/mapping.csv | 238 +++++++++++++++++++++ src/romkan/utils.py | 44 ++++ tests/test.py | 4 +- 7 files changed, 387 insertions(+), 444 deletions(-) create mode 100644 src/romkan/compat.py create mode 100644 src/romkan/data.py create mode 100644 src/romkan/mapping.csv create mode 100644 src/romkan/utils.py diff --git a/setup.py b/setup.py index 75afc02..c80035d 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ PROJ_METADATA = '%s.json' % PROJ_NAME -import os, json, imp +import os, json, imp, sys here = os.path.abspath(os.path.dirname(__file__)) proj_info = json.loads(open(os.path.join(here, PROJ_METADATA)).read()) @@ -13,6 +13,9 @@ CHANGELOG = open(os.path.join(here, 'CHANGELOG.rst')).read() VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__ +PY2 = sys.version_info[0] <= 2 +deps = ['unicodecsv'] if PY2 else [] + from setuptools import setup, find_packages setup( name = proj_info['name'], @@ -32,6 +35,7 @@ package_dir = {'' : 'src'}, test_suite = 'tests', + install_requires = deps, platforms = 'any', zip_safe = False, diff --git a/src/romkan/common.py b/src/romkan/common.py index 4499c74..b72ddb2 100644 --- a/src/romkan/common.py +++ b/src/romkan/common.py @@ -3,8 +3,10 @@ from __future__ import unicode_literals from .version import __version__ - import re +from romkan.data import ROMKAN, KANROM, KUNREI, HEPBURN +from romkan.data import hira_strict, kata +from romkan.utils import is_romaji, is_kana try: from functools import cmp_to_key except ImportError: @@ -43,344 +45,6 @@ def __ne__(self, other): # the Ruby's licence. # -# This table is imported from KAKASI and modified. - -KUNREITAB = """ァ xa ア a ィ xi イ i ゥ xu -ウ u ヴ vu ヴァ va ヴィ vi ヴェ ve -ヴォ vo ェ xe エ e ォ xo オ o - -カ ka ガ ga キ ki キャ kya キュ kyu -キョ kyo ギ gi ギャ gya ギュ gyu ギョ gyo -ク ku グ gu ケ ke ゲ ge コ ko -ゴ go - -サ sa ザ za シ si シャ sya シュ syu -ショ syo シェ sye -ジ zi ジャ zya ジュ zyu ジョ zyo -ス su ズ zu セ se ゼ ze ソ so -ゾ zo - -タ ta ダ da チ ti チャ tya チュ tyu -チョ tyo ヂ di ヂャ dya ヂュ dyu ヂョ dyo -ティ ti - -ッ xtu -ッヴ vvu ッヴァ vva ッヴィ vvi -ッヴェ vve ッヴォ vvo -ッカ kka ッガ gga ッキ kki ッキャ kkya -ッキュ kkyu ッキョ kkyo ッギ ggi ッギャ ggya -ッギュ ggyu ッギョ ggyo ック kku ッグ ggu -ッケ kke ッゲ gge ッコ kko ッゴ ggo ッサ ssa -ッザ zza ッシ ssi ッシャ ssya -ッシュ ssyu ッショ ssyo ッシェ ssye -ッジ zzi ッジャ zzya ッジュ zzyu ッジョ zzyo -ッス ssu ッズ zzu ッセ sse ッゼ zze ッソ sso -ッゾ zzo ッタ tta ッダ dda ッチ tti ッティ tti -ッチャ ttya ッチュ ttyu ッチョ ttyo ッヂ ddi -ッヂャ ddya ッヂュ ddyu ッヂョ ddyo ッツ ttu -ッヅ ddu ッテ tte ッデ dde ット tto ッド ddo -ッドゥ ddu -ッハ hha ッバ bba ッパ ppa ッヒ hhi -ッヒャ hhya ッヒュ hhyu ッヒョ hhyo ッビ bbi -ッビャ bbya ッビュ bbyu ッビョ bbyo ッピ ppi -ッピャ ppya ッピュ ppyu ッピョ ppyo ッフ hhu ッフュ ffu -ッファ ffa ッフィ ffi ッフェ ffe ッフォ ffo -ッブ bbu ップ ppu ッヘ hhe ッベ bbe ッペ ppe -ッホ hho ッボ bbo ッポ ppo ッヤ yya ッユ yyu -ッヨ yyo ッラ rra ッリ rri ッリャ rrya -ッリュ rryu ッリョ rryo ッル rru ッレ rre -ッロ rro - -ツ tu ヅ du テ te デ de ト to -ド do ドゥ du - -ナ na ニ ni ニャ nya ニュ nyu ニョ nyo -ヌ nu ネ ne ノ no - -ハ ha バ ba パ pa ヒ hi ヒャ hya -ヒュ hyu ヒョ hyo ビ bi ビャ bya ビュ byu -ビョ byo ピ pi ピャ pya ピュ pyu ピョ pyo -フ hu ファ fa フィ fi フェ fe フォ fo -フュ fu -ブ bu プ pu ヘ he ベ be ペ pe -ホ ho ボ bo ポ po - -マ ma ミ mi ミャ mya ミュ myu ミョ myo -ム mu メ me モ mo - -ャ xya ヤ ya ュ xyu ユ yu ョ xyo -ヨ yo - -ラ ra リ ri リャ rya リュ ryu リョ ryo -ル ru レ re ロ ro - -ヮ xwa ワ wa ウィ wi ヰ wi ヱ we ウェ we -ヲ wo ウォ wo ン n - -ン n' -ディ dyi -ー - -チェ tye -ッチェ ttye -ジェ zye -""" - -KUNREITAB_H = """ぁ xa あ a ぃ xi い i ぅ xu -う u う゛ vu う゛ぁ va う゛ぃ vi う゛ぇ ve -う゛ぉ vo ぇ xe え e ぉ xo お o - -か ka が ga き ki きゃ kya きゅ kyu -きょ kyo ぎ gi ぎゃ gya ぎゅ gyu ぎょ gyo -く ku ぐ gu け ke げ ge こ ko -ご go - -さ sa ざ za し si しゃ sya しゅ syu -しょ syo じ zi じゃ zya じゅ zyu じょ zyo -す su ず zu せ se ぜ ze そ so -ぞ zo - -た ta だ da ち ti ちゃ tya ちゅ tyu -ちょ tyo ぢ di ぢゃ dya ぢゅ dyu ぢょ dyo - -っ xtu -っう゛ vvu っう゛ぁ vva っう゛ぃ vvi -っう゛ぇ vve っう゛ぉ vvo -っか kka っが gga っき kki っきゃ kkya -っきゅ kkyu っきょ kkyo っぎ ggi っぎゃ ggya -っぎゅ ggyu っぎょ ggyo っく kku っぐ ggu -っけ kke っげ gge っこ kko っご ggo っさ ssa -っざ zza っし ssi っしゃ ssya -っしゅ ssyu っしょ ssyo -っじ zzi っじゃ zzya っじゅ zzyu っじょ zzyo -っす ssu っず zzu っせ sse っぜ zze っそ sso -っぞ zzo った tta っだ dda っち tti -っちゃ ttya っちゅ ttyu っちょ ttyo っぢ ddi -っぢゃ ddya っぢゅ ddyu っぢょ ddyo っつ ttu -っづ ddu って tte っで dde っと tto っど ddo -っは hha っば bba っぱ ppa っひ hhi -っひゃ hhya っひゅ hhyu っひょ hhyo っび bbi -っびゃ bbya っびゅ bbyu っびょ bbyo っぴ ppi -っぴゃ ppya っぴゅ ppyu っぴょ ppyo っふ hhu -っふぁ ffa っふぃ ffi っふぇ ffe っふぉ ffo -っぶ bbu っぷ ppu っへ hhe っべ bbe っぺ ppe -っほ hho っぼ bbo っぽ ppo っや yya っゆ yyu -っよ yyo っら rra っり rri っりゃ rrya -っりゅ rryu っりょ rryo っる rru っれ rre -っろ rro - -つ tu づ du て te で de と to -ど do - -な na に ni にゃ nya にゅ nyu にょ nyo -ぬ nu ね ne の no - -は ha ば ba ぱ pa ひ hi ひゃ hya -ひゅ hyu ひょ hyo び bi びゃ bya びゅ byu -びょ byo ぴ pi ぴゃ pya ぴゅ pyu ぴょ pyo -ふ hu ふぁ fa ふぃ fi ふぇ fe ふぉ fo -ぶ bu ぷ pu へ he べ be ぺ pe -ほ ho ぼ bo ぽ po - -ま ma み mi みゃ mya みゅ myu みょ myo -む mu め me も mo - -ゃ xya や ya ゅ xyu ゆ yu ょ xyo -よ yo - -ら ra り ri りゃ rya りゅ ryu りょ ryo -る ru れ re ろ ro - -ゎ xwa わ wa ゐ wi ゑ we -を wo ん n - -ん n' -でぃ dyi -ー - -ちぇ tye -っちぇ ttye -じぇ zye -""" - -HEPBURNTAB = """ァ xa ア a ィ xi イ i ゥ xu -ウ u ヴ vu ヴァ va ヴィ vi ヴェ ve -ヴォ vo ェ xe エ e ォ xo オ o - - -カ ka ガ ga キ ki キャ kya キュ kyu -キョ kyo ギ gi ギャ gya ギュ gyu ギョ gyo -ク ku グ gu ケ ke ゲ ge コ ko -ゴ go - -サ sa ザ za シ shi シャ sha シュ shu -ショ sho シェ she -ジ ji ジャ ja ジュ ju ジョ jo -ス su ズ zu セ se ゼ ze ソ so -ゾ zo - -タ ta ダ da チ chi チャ cha チュ chu -チョ cho ヂ di ヂャ dya ヂュ dyu ヂョ dyo -ティ ti - -ッ xtsu -ッヴ vvu ッヴァ vva ッヴィ vvi -ッヴェ vve ッヴォ vvo -ッカ kka ッガ gga ッキ kki ッキャ kkya -ッキュ kkyu ッキョ kkyo ッギ ggi ッギャ ggya -ッギュ ggyu ッギョ ggyo ック kku ッグ ggu -ッケ kke ッゲ gge ッコ kko ッゴ ggo ッサ ssa -ッザ zza ッシ sshi ッシャ ssha -ッシュ sshu ッショ ssho ッシェ sshe -ッジ jji ッジャ jja ッジュ jju ッジョ jjo -ッス ssu ッズ zzu ッセ sse ッゼ zze ッソ sso -ッゾ zzo ッタ tta ッダ dda ッチ cchi ッティ tti -ッチャ ccha ッチュ cchu ッチョ ccho ッヂ ddi -ッヂャ ddya ッヂュ ddyu ッヂョ ddyo ッツ ttsu -ッヅ ddu ッテ tte ッデ dde ット tto ッド ddo -ッドゥ ddu -ッハ hha ッバ bba ッパ ppa ッヒ hhi -ッヒャ hhya ッヒュ hhyu ッヒョ hhyo ッビ bbi -ッビャ bbya ッビュ bbyu ッビョ bbyo ッピ ppi -ッピャ ppya ッピュ ppyu ッピョ ppyo ッフ ffu ッフュ ffu -ッファ ffa ッフィ ffi ッフェ ffe ッフォ ffo -ッブ bbu ップ ppu ッヘ hhe ッベ bbe ッペ ppe -ッホ hho ッボ bbo ッポ ppo ッヤ yya ッユ yyu -ッヨ yyo ッラ rra ッリ rri ッリャ rrya -ッリュ rryu ッリョ rryo ッル rru ッレ rre -ッロ rro - -ツ tsu ヅ du テ te デ de ト to -ド do ドゥ du - -ナ na ニ ni ニャ nya ニュ nyu ニョ nyo -ヌ nu ネ ne ノ no - -ハ ha バ ba パ pa ヒ hi ヒャ hya -ヒュ hyu ヒョ hyo ビ bi ビャ bya ビュ byu -ビョ byo ピ pi ピャ pya ピュ pyu ピョ pyo -フ fu ファ fa フィ fi フェ fe フォ fo -フュ fu -ブ bu プ pu ヘ he ベ be ペ pe -ホ ho ボ bo ポ po - -マ ma ミ mi ミャ mya ミュ myu ミョ myo -ム mu メ me モ mo - -ャ xya ヤ ya ュ xyu ユ yu ョ xyo -ヨ yo - -ラ ra リ ri リャ rya リュ ryu リョ ryo -ル ru レ re ロ ro - -ヮ xwa ワ wa ウィ wi ヰ wi ヱ we ウェ we -ヲ wo ウォ wo ン n - -ン n' -ディ di -ー - -チェ che -ッチェ cche -ジェ je -""" - -HEPBURNTAB_H = """ぁ xa あ a ぃ xi い i ぅ xu -う u う゛ vu う゛ぁ va う゛ぃ vi う゛ぇ ve -う゛ぉ vo ぇ xe え e ぉ xo お o - - -か ka が ga き ki きゃ kya きゅ kyu -きょ kyo ぎ gi ぎゃ gya ぎゅ gyu ぎょ gyo -く ku ぐ gu け ke げ ge こ ko -ご go - -さ sa ざ za し shi しゃ sha しゅ shu -しょ sho じ ji じゃ ja じゅ ju じょ jo -す su ず zu せ se ぜ ze そ so -ぞ zo - -た ta だ da ち chi ちゃ cha ちゅ chu -ちょ cho ぢ di ぢゃ dya ぢゅ dyu ぢょ dyo - -っ xtsu -っう゛ vvu っう゛ぁ vva っう゛ぃ vvi -っう゛ぇ vve っう゛ぉ vvo -っか kka っが gga っき kki っきゃ kkya -っきゅ kkyu っきょ kkyo っぎ ggi っぎゃ ggya -っぎゅ ggyu っぎょ ggyo っく kku っぐ ggu -っけ kke っげ gge っこ kko っご ggo っさ ssa -っざ zza っし sshi っしゃ ssha -っしゅ sshu っしょ ssho -っじ jji っじゃ jja っじゅ jju っじょ jjo -っす ssu っず zzu っせ sse っぜ zze っそ sso -っぞ zzo った tta っだ dda っち cchi -っちゃ ccha っちゅ cchu っちょ ccho っぢ ddi -っぢゃ ddya っぢゅ ddyu っぢょ ddyo っつ ttsu -っづ ddu って tte っで dde っと tto っど ddo -っは hha っば bba っぱ ppa っひ hhi -っひゃ hhya っひゅ hhyu っひょ hhyo っび bbi -っびゃ bbya っびゅ bbyu っびょ bbyo っぴ ppi -っぴゃ ppya っぴゅ ppyu っぴょ ppyo っふ ffu -っふぁ ffa っふぃ ffi っふぇ ffe っふぉ ffo -っぶ bbu っぷ ppu っへ hhe っべ bbe っぺ ppe -っほ hho っぼ bbo っぽ ppo っや yya っゆ yyu -っよ yyo っら rra っり rri っりゃ rrya -っりゅ rryu っりょ rryo っる rru っれ rre -っろ rro - -つ tsu づ du て te で de と to -ど do - -な na に ni にゃ nya にゅ nyu にょ nyo -ぬ nu ね ne の no - -は ha ば ba ぱ pa ひ hi ひゃ hya -ひゅ hyu ひょ hyo び bi びゃ bya びゅ byu -びょ byo ぴ pi ぴゃ pya ぴゅ pyu ぴょ pyo -ふ fu ふぁ fa ふぃ fi ふぇ fe ふぉ fo -ぶ bu ぷ pu へ he べ be ぺ pe -ほ ho ぼ bo ぽ po - -ま ma み mi みゃ mya みゅ myu みょ myo -む mu め me も mo - -ゃ xya や ya ゅ xyu ゆ yu ょ xyo -よ yo - -ら ra り ri りゃ rya りゅ ryu りょ ryo -る ru れ re ろ ro - -ゎ xwa わ wa ゐ wi ゑ we -を wo ん n - -ん n' -でぃ dyi -ー - -ちぇ che -っちぇ cche -じぇ je -""" - -def pairs(arr, size=2): - for i in range(0, len(arr)-1, size): - yield arr[i:i+size] - - - -# Use Katakana - -KANROM = {} -ROMKAN = {} - -for pair in pairs(re.split("\s+", KUNREITAB + HEPBURNTAB)): - kana, roma = pair - KANROM[kana] = roma - ROMKAN[roma] = kana - -# special modification -# wo -> ヲ, but ヲ/ウォ -> wo -# du -> ヅ, but ヅ/ドゥ -> du -# we -> ウェ, ウェ -> we -ROMKAN.update( {"du": "ヅ", "di": "ヂ", "fu": "フ", "ti": "チ", - "wi": "ウィ", "we": "ウェ", "wo": "ヲ" } ) # Sort in long order so that a longer Romaji sequence precedes. @@ -390,8 +54,6 @@ def pairs(arr, size=2): _kanpat_cmp = lambda x, y: (len(y) > len(x)) - (len(y) < len(x)) or (len(KANROM[x]) > len(KANROM[x])) - (len(KANROM[x]) < len(KANROM[x])) KANPAT = re.compile("|".join(sorted(KANROM.keys(), key=cmp_to_key(_kanpat_cmp)))) -KUNREI = [y for (x, y) in pairs(re.split("\s+", KUNREITAB)) ] -HEPBURN = [y for (x, y) in pairs(re.split("\s+", HEPBURNTAB) )] KUNPAT = re.compile("|".join(sorted(KUNREI, key=_len_cmp)) ) HEPPAT = re.compile("|".join(sorted(HEPBURN, key=_len_cmp)) ) @@ -406,49 +68,6 @@ def pairs(arr, size=2): TO_HEPBURN.update( {'ti': 'chi' }) - -# Use Hiragana - -KANROM_H = {} -ROMKAN_H = {} - -for pair in pairs(re.split("\s+", KUNREITAB_H + HEPBURNTAB_H)): - kana, roma = pair - KANROM_H[kana] = roma - ROMKAN_H[roma] = kana - -# special modification -# wo -> ヲ, but ヲ/ウォ -> wo -# du -> ヅ, but ヅ/ドゥ -> du -# we -> ウェ, ウェ -> we -ROMKAN_H.update( {"du": "づ", "di": "ぢ", "fu": "ふ", "ti": "ち", - "wi": "うぃ", "we": "うぇ", "wo": "を" } ) - -# Sort in long order so that a longer Romaji sequence precedes. - -_len_cmp = lambda x: -len(x) -ROMPAT_H = re.compile("|".join(sorted(ROMKAN_H.keys(), key=_len_cmp)) ) - -_kanpat_cmp = lambda x, y: (len(y) > len(x)) - (len(y) < len(x)) or (len(KANROM_H[x]) > len(KANROM_H[x])) - (len(KANROM_H[x]) < len(KANROM_H[x])) -KANPAT_H = re.compile("|".join(sorted(KANROM_H.keys(), key=cmp_to_key(_kanpat_cmp)))) - -KUNREI_H = [y for (x, y) in pairs(re.split("\s+", KUNREITAB_H)) ] -HEPBURN_H = [y for (x, y) in pairs(re.split("\s+", HEPBURNTAB_H) )] - -KUNPAT_H = re.compile("|".join(sorted(KUNREI_H, key=_len_cmp)) ) -HEPPAT_H = re.compile("|".join(sorted(HEPBURN_H, key=_len_cmp)) ) - -TO_HEPBURN_H = {} -TO_KUNREI_H = {} - -for kun, hep in zip(KUNREI_H, HEPBURN_H): - TO_HEPBURN_H[kun] = hep - TO_KUNREI_H[hep] = kun - -TO_HEPBURN_H.update( {'ti': 'chi' }) - - - def normalize_double_n(str): """ Normalize double n. @@ -457,7 +76,7 @@ def normalize_double_n(str): # Replace double n with n' str = re.sub("nn", "n'", str) # Remove unnecessary apostrophes - str = re.sub("n'(?=[^aiueoyn]|$)", "n", str) + str = re.sub("n'(?=[^aeiuoyn]|$)", "n", str) return str @@ -476,48 +95,36 @@ def to_hiragana(str): """ Convert a Romaji (ローマ字) to a Hiragana (平仮名). """ - - str = str.lower() - str = normalize_double_n(str) - - tmp = ROMPAT_H.sub(lambda x: ROMKAN_H[x.group(0)], str) - return tmp + return hira_strict(to_katakana(str)) def to_kana(str): """ Convert a Romaji (ローマ字) to a Katakana (片仮名). (same as to_katakana) """ - return to_katakana(str) -def to_hepburn(str): +def to_hepburn(text): """ Convert a Kana (仮名) or a Kunrei-shiki Romaji (訓令式ローマ字) to a Hepburn Romaji (ヘボン式ローマ字). """ - - tmp = str - tmp = KANPAT.sub(lambda x: KANROM[x.group(0)], tmp) - tmp = KANPAT_H.sub(lambda x: KANROM_H[x.group(0)], tmp) - + assert is_romaji(text) or is_kana(text) + romaji = text = normalize_double_n(text.lower()) + if is_kana(text): + katakana = kata(romaji) # coerce to *kata*-kana + romaji = KANPAT.sub(lambda x: KANROM[x.group(0)], katakana) + # If unmodified, maybe it's a Kunrei-shiki Romaji -- convert it to a Hepburn Romaji + if romaji == text: + romaji = KUNPAT.sub(lambda x: TO_HEPBURN[x.group(0)], romaji) # Remove unnecessary apostrophes - tmp = re.sub("n'(?=[^aeiuoyn]|$)", "n", tmp) - - # If unmodified, it's a Kunrei-shiki Romaji -- convert it to a Hepburn Romaji - if tmp == str: - tmp = tmp.lower() - tmp = normalize_double_n(tmp) - tmp = KUNPAT.sub(lambda x: TO_HEPBURN[x.group(0)], tmp) - - return tmp + return re.sub("n'(?=[^aeiuoyn]|$)", "n", romaji) def to_kunrei(str): """ Convert a Kana (仮名) or a Hepburn Romaji (ヘボン式ローマ字) to a Kunrei-shiki Romaji (訓令式ローマ字). """ - tmp = str + tmp = kata(str) tmp = KANPAT.sub(lambda x: KANROM[x.group(0)], tmp) - tmp = KANPAT_H.sub(lambda x: KANROM_H[x.group(0)], tmp) # Remove unnecessary apostrophes tmp = re.sub("n'(?=[^aeiuoyn]|$)", "n", tmp) @@ -535,41 +142,10 @@ def to_roma(str): Convert a Kana (仮名) to a Hepburn Romaji (ヘボン式ローマ字). """ - tmp = str + tmp = kata(str) tmp = KANPAT.sub(lambda x: KANROM[x.group(0)], tmp) - tmp = KANPAT_H.sub(lambda x: KANROM_H[x.group(0)], tmp) # Remove unnecessary apostrophes tmp = re.sub("n'(?=[^aeiuoyn]|$)", "n", tmp) return tmp - -def is_consonant(str): - """ - Return a MatchObject if a Latin letter is a consonant in Japanese. - Return None otherwise. - """ - - str = str.lower() - - return re.match("[ckgszjtdhfpbmyrwxn]", str) - -def is_vowel(str): - """ - Return a MatchObject if a Latin letter is a vowel in Japanese. - Return None otherwise. - """ - - str = str.lower() - - return re.match("[aeiou]", str) - -def expand_consonant(str): - """ - Expand consonant to its related moras. - Example: 'sh' => ['sha', 'she', 'shi', 'sho', 'shu'] - """ - - str = str.lower() - - return sorted([mora for mora in ROMKAN.keys() if re.match("^%s.$" % str, mora)]) diff --git a/src/romkan/compat.py b/src/romkan/compat.py new file mode 100644 index 0000000..44e0cc3 --- /dev/null +++ b/src/romkan/compat.py @@ -0,0 +1,14 @@ +import sys +import string + +PY2 = sys.version_info[0] <= 2 + +letters = string.letters if PY2 else string.ascii_letters +if PY2: + from unicodecsv import DictReader + unichr = unichr + unicode = unicode +else: + from csv import DictReader + unichr = chr + unicode = str diff --git a/src/romkan/data.py b/src/romkan/data.py new file mode 100644 index 0000000..fdea3b2 --- /dev/null +++ b/src/romkan/data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +import os +import re +from romkan.compat import DictReader, unichr, unicode + + +HERE = os.path.abspath(os.path.dirname(__file__)) +MAPPING = os.path.join(HERE, 'mapping.csv') +_len_cmp = lambda x: -len(x) + + +common = [unichr(c) for c in (0x3099, 0x309A, 0x309B, 0x309C, 0x30A0, 0x30FC)] +# Note: code points that would correspond to Hiragana VA (0x3097) and VI (0x3098) are unused +hiragana = [unichr(i) for i in range(0x3041, 0x3097)] + ['ゔぁ', 'ゔぃ', 'ゝ', 'ゞ'] + common +katakana = [unichr(i) for i in range(0x30A1, 0x30F9)] + ['ヽ', 'ヾ'] + common + + +h2k_dct = dict(zip(hiragana, katakana)) +k2h_dct = dict(zip(katakana, hiragana)) +to_hira_dct = dict(zip(hiragana + katakana, hiragana * 2)) +to_kata_dct = dict(zip(hiragana + katakana, katakana * 2)) + + +def get_converter(dct, strict=True): + def convert(text): + if not strict: + pat = re.compile("|".join(sorted(dct, key=_len_cmp))) + return pat.sub(lambda x: dct[x.group(0)], text) + assert isinstance(text, unicode) and len(text) > 0 + try: + return ''.join([dct[c] for c in text]) + except KeyError: + msg = hex(ord(c)) if len(text) == 1 else text + raise UnicodeError("Invalid code point supplied: %s" % msg) + return convert + +h2k, k2h, hira_strict, kata_strict = map(get_converter, (h2k_dct, k2h_dct, to_hira_dct, to_kata_dct)) +kata = get_converter(to_kata_dct, strict=False) + +KANROM = dict([(r['Katakana'], r['Kunrei']) for r in DictReader(open(MAPPING))]) +ROMKAN = dict([(r['Kunrei'], r['Katakana']) for r in DictReader(open(MAPPING))]) + +# FIXME - this replicates previous behavior but is incorrect because it mixes Hepburn and Kunrei schemes +KANROM.update(dict([(r['Katakana'], r['Hepburn']) for r in DictReader(open(MAPPING))])) +ROMKAN.update(dict([(r['Hepburn'], r['Katakana']) for r in DictReader(open(MAPPING))])) + +# special modification +# wo -> ヲ, but ヲ/ウォ -> wo +# du -> ヅ, but ヅ/ドゥ -> du +# we -> ウェ, ウェ -> we +ROMKAN.update( {"du": "ヅ", "di": "ヂ", "fu": "フ", "ti": "チ", + "wi": "ウィ", "we": "ウェ", "wo": "ヲ" } ) + + +KUNREI = [r['Kunrei'] for r in DictReader(open(MAPPING))] +HEPBURN = [r['Hepburn'] for r in DictReader(open(MAPPING))] + + +if __name__ == "__main__": + for row in DictReader(open('mapping.csv')): + h, k = row['Hiragana'], row['Katakana'] + assert k == h2k(h) and h == k2h(k) + assert k == kata_strict(k) and h == hira_strict(h) + assert h == hira_strict(kata_strict(h)) + assert k == kata_strict(hira_strict(k)) diff --git a/src/romkan/mapping.csv b/src/romkan/mapping.csv new file mode 100644 index 0000000..b0be6d2 --- /dev/null +++ b/src/romkan/mapping.csv @@ -0,0 +1,238 @@ +Katakana,Hiragana,Kunrei,Hepburn +ァ,ぁ,xa,xa +ウ,う,u,u +ヴォ,ゔぉ,vo,vo +カ,か,ka,ka +キョ,きょ,kyo,kyo +ク,く,ku,ku +ゴ,ご,go,go +サ,さ,sa,sa +ショ,しょ,syo,sho +ジ,じ,zi,ji +ス,す,su,su +ゾ,ぞ,zo,zo +タ,た,ta,ta +チョ,ちょ,tyo,cho +ティ,てぃ,ti,ti +ッ,っ,xtu,xtsu +ッヴ,っゔ,vvu,vvu +ッヴェ,っゔぇ,vve,vve +ッカ,っか,kka,kka +ッキュ,っきゅ,kkyu,kkyu +ッギュ,っぎゅ,ggyu,ggyu +ッケ,っけ,kke,kke +ッザ,っざ,zza,zza +ッシュ,っしゅ,ssyu,sshu +ッジ,っじ,zzi,jji +ッス,っす,ssu,ssu +ッゾ,っぞ,zzo,zzo +ッチャ,っちゃ,ttya,ccha +ッヂャ,っぢゃ,ddya,ddya +ッヅ,っづ,ddu,ddu +ッドゥ,っどぅ,ddu,ddu +ッハ,っは,hha,hha +ッヒャ,っひゃ,hhya,hhya +ッビャ,っびゃ,bbya,bbya +ッピャ,っぴゃ,ppya,ppya +ッファ,っふぁ,ffa,ffa +ッブ,っぶ,bbu,bbu +ッホ,っほ,hho,hho +ッヨ,っよ,yyo,yyo +ッリュ,っりゅ,rryu,rryu +ッロ,っろ,rro,rro +ツ,つ,tu,tsu +ド,ど,do,do +ナ,な,na,na +ヌ,ぬ,nu,nu +ハ,は,ha,ha +ヒュ,ひゅ,hyu,hyu +ビョ,びょ,byo,byo +フ,ふ,hu,fu +フュ,ふゅ,fu,fu +ブ,ぶ,bu,bu +ホ,ほ,ho,ho +マ,ま,ma,ma +ム,む,mu,mu +ャ,ゃ,xya,xya +ヨ,よ,yo,yo +ラ,ら,ra,ra +ル,る,ru,ru +ヮ,ゎ,xwa,xwa +ヲ,を,wo,wo +ン,ん,n',n' +ディ,でぃ,dyi,di +ー,ー,-,- +チェ,ちぇ,tye,che +ッチェ,っちぇ,ttye,cche +ジェ,じぇ,zye,je +ア,あ,a,a +ヴ,ゔ,vu,vu +ェ,ぇ,xe,xe +ガ,が,ga,ga +ギ,ぎ,gi,gi +グ,ぐ,gu,gu +ザ,ざ,za,za +シェ,しぇ,sye,she +ジャ,じゃ,zya,ja +ズ,ず,zu,zu +ダ,だ,da,da +ヂ,ぢ,di,di +ッヴァ,っゔぁ,vva,vva +ッヴォ,っゔぉ,vvo,vvo +ッガ,っが,gga,gga +ッキョ,っきょ,kkyo,kkyo +ッギョ,っぎょ,ggyo,ggyo +ッゲ,っげ,gge,gge +ッシ,っし,ssi,sshi +ッショ,っしょ,ssyo,ssho +ッジャ,っじゃ,zzya,jja +ッズ,っず,zzu,zzu +ッタ,った,tta,tta +ッチュ,っちゅ,ttyu,cchu +ッヂュ,っぢゅ,ddyu,ddyu +ッテ,って,tte,tte +ッバ,っば,bba,bba +ッヒュ,っひゅ,hhyu,hhyu +ッビュ,っびゅ,bbyu,bbyu +ッピュ,っぴゅ,ppyu,ppyu +ッフィ,っふぃ,ffi,ffi +ップ,っぷ,ppu,ppu +ッボ,っぼ,bbo,bbo +ッラ,っら,rra,rra +ッリョ,っりょ,rryo,rryo +ヅ,づ,du,du +ドゥ,どぅ,du,du +ニ,に,ni,ni +ネ,ね,ne,ne +バ,ば,ba,ba +ヒョ,ひょ,hyo,hyo +ピ,ぴ,pi,pi +ファ,ふぁ,fa,fa +プ,ぷ,pu,pu +ボ,ぼ,bo,bo +ミ,み,mi,mi +メ,め,me,me +ヤ,や,ya,ya +リ,り,ri,ri +レ,れ,re,re +ワ,わ,wa,wa +ウォ,うぉ,wo,wo +ウェ,うぇ,we,we +ィ,ぃ,xi,xi +ヴァ,ゔぁ,va,va +エ,え,e,e +キ,き,ki,ki +ギャ,ぎゃ,gya,gya +ケ,け,ke,ke +シ,し,si,shi +ジュ,じゅ,zyu,ju +セ,せ,se,se +チ,ち,ti,chi +ヂャ,ぢゃ,dya,dya +ッヴィ,っゔぃ,vvi,vvi +ッキ,っき,kki,kki +ッギ,っぎ,ggi,ggi +ック,っく,kku,kku +ッコ,っこ,kko,kko +ッシャ,っしゃ,ssya,ssha +ッシェ,っしぇ,ssye,sshe +ッジュ,っじゅ,zzyu,jju +ッセ,っせ,sse,sse +ッダ,っだ,dda,dda +ッチョ,っちょ,ttyo,ccho +ッヂョ,っぢょ,ddyo,ddyo +ッデ,っで,dde,dde +ッパ,っぱ,ppa,ppa +ッヒョ,っひょ,hhyo,hhyo +ッビョ,っびょ,bbyo,bbyo +ッピョ,っぴょ,ppyo,ppyo +ッフェ,っふぇ,ffe,ffe +ッヘ,っへ,hhe,hhe +ッポ,っぽ,ppo,ppo +ッリ,っり,rri,rri +ッル,っる,rru,rru +テ,て,te,te +ニャ,にゃ,nya,nya +ノ,の,no,no +パ,ぱ,pa,pa +ビ,び,bi,bi +ピャ,ぴゃ,pya,pya +フィ,ふぃ,fi,fi +ヘ,へ,he,he +ポ,ぽ,po,po +ミャ,みゃ,mya,mya +モ,も,mo,mo +ュ,ゅ,xyu,xyu +リャ,りゃ,rya,rya +ロ,ろ,ro,ro +ウィ,うぃ,wi,wi +ン,ん,n,n' +イ,い,i,i +ヴィ,ゔぃ,vi,vi +ォ,ぉ,xo,xo +キャ,きゃ,kya,kya +ギュ,ぎゅ,gyu,gyu +ゲ,げ,ge,ge +シャ,しゃ,sya,sha +ジョ,じょ,zyo,jo +ゼ,ぜ,ze,ze +チャ,ちゃ,tya,cha +ヂュ,ぢゅ,dyu,dyu +ッキャ,っきゃ,kkya,kkya +ッギャ,っぎゃ,ggya,ggya +ッグ,っぐ,ggu,ggu +ッゴ,っご,ggo,ggo +ッジョ,っじょ,zzyo,jjo +ッゼ,っぜ,zze,zze +ッチ,っち,tti,cchi +ッヂ,っぢ,ddi,ddi +ッツ,っつ,ttu,ttsu +ット,っと,tto,tto +ッヒ,っひ,hhi,hhi +ッビ,っび,bbi,bbi +ッピ,っぴ,ppi,ppi +ッフ,っふ,hhu,ffu +ッフォ,っふぉ,ffo,ffo +ッベ,っべ,bbe,bbe +ッヤ,っや,yya,yya +ッリャ,っりゃ,rrya,rrya +ッレ,っれ,rre,rre +デ,で,de,de +ニュ,にゅ,nyu,nyu +ヒ,ひ,hi,hi +ビャ,びゃ,bya,bya +ピュ,ぴゅ,pyu,pyu +フェ,ふぇ,fe,fe +ベ,べ,be,be +ミュ,みゅ,myu,myu +ユ,ゆ,yu,yu +リュ,りゅ,ryu,ryu +ヰ,ゐ,wi,wi +ゥ,ぅ,xu,xu +ヴェ,ゔぇ,ve,ve +オ,お,o,o +キュ,きゅ,kyu,kyu +ギョ,ぎょ,gyo,gyo +コ,こ,ko,ko +シュ,しゅ,syu,shu +ソ,そ,so,so +チュ,ちゅ,tyu,chu +ヂョ,ぢょ,dyo,dyo +ッサ,っさ,ssa,ssa +ッソ,っそ,sso,sso +ッティ,ってぃ,tti,tti +ッド,っど,ddo,ddo +ッフュ,っふゅ,ffu,ffu +ッペ,っぺ,ppe,ppe +ッユ,っゆ,yyu,yyu +ト,と,to,to +ニョ,にょ,nyo,nyo +ヒャ,ひゃ,hya,hya +ビュ,びゅ,byu,byu +ピョ,ぴょ,pyo,pyo +フォ,ふぉ,fo,fo +ペ,ぺ,pe,pe +ミョ,みょ,myo,myo +ョ,ょ,xyo,xyo +リョ,りょ,ryo,ryo +ヱ,ゑ,we,we diff --git a/src/romkan/utils.py b/src/romkan/utils.py new file mode 100644 index 0000000..7df8ef1 --- /dev/null +++ b/src/romkan/utils.py @@ -0,0 +1,44 @@ +import re +from romkan.data import ROMKAN, katakana, hiragana +from romkan.compat import letters + + +def is_consonant(c): + """ + Return a MatchObject if a Latin letter is a consonant in Japanese. + Return None otherwise. + """ + return re.match("[ckgszjtdhfpbmyrwxn]", c.lower()) + + +def is_vowel(c): + """ + Return a MatchObject if a Latin letter is a vowel in Japanese. + Return None otherwise. + """ + return re.match("[aeiou]", c.lower()) + + +def expand_consonant(c): + """ + Expand consonant to its related moras. + Example: 'sh' => ['sha', 'she', 'shi', 'sho', 'shu'] + """ + c = c.lower() + return sorted([mora for mora in ROMKAN.keys() if re.match("^%s.$" % c, mora)]) + + +def is_romaji(text): + return all([c in letters for c in text]) + + +def is_katakana(text): + return all([c in katakana for c in text]) + + +def is_hiragana(text): + return all([c in katakana for c in text]) + + +def is_kana(text): + return all([c in katakana or c in hiragana for c in text]) diff --git a/tests/test.py b/tests/test.py index 1e7a11b..2855653 100755 --- a/tests/test.py +++ b/tests/test.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import unittest import sys import os @@ -14,7 +13,8 @@ os.path.dirname( os.path.abspath(__file__))))) from romkan import * - +from romkan.utils import expand_consonant, is_consonant, is_vowel +import unittest class RomkanTestCase(unittest.TestCase): def test_to_katakana(self): From 7715b18c4d0c3f04ed67bd765a1b0a3f074a6bec Mon Sep 17 00:00:00 2001 From: Baptiste Lagarde Date: Sun, 2 Feb 2014 21:55:41 +1100 Subject: [PATCH 2/2] FIX: is_hiragana() --- src/romkan/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/romkan/utils.py b/src/romkan/utils.py index 7df8ef1..9028cf2 100644 --- a/src/romkan/utils.py +++ b/src/romkan/utils.py @@ -37,7 +37,7 @@ def is_katakana(text): def is_hiragana(text): - return all([c in katakana for c in text]) + return all([c in hiragana for c in text]) def is_kana(text):