Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@

PROJ_METADATA = '%s.json' % PROJ_NAME

import os, json, imp
import os, json, imp, sys

here = os.path.abspath(os.path.dirname(__file__))


def _read(relative_path):
    """Return the contents of a file located relative to this script.

    The file is closed as soon as it has been read, instead of being left
    for the garbage collector to clean up.
    """
    with open(os.path.join(here, relative_path)) as handle:
        return handle.read()


proj_info = json.loads(_read(PROJ_METADATA))
README = _read('README.rst')
CHANGELOG = _read('CHANGELOG.rst')
VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__

# unicodecsv is listed as a requirement only when installing under Python 2.
PY2 = sys.version_info[0] <= 2
deps = ['unicodecsv'] if PY2 else []

from setuptools import setup, find_packages
setup(
name = proj_info['name'],
Expand All @@ -32,6 +35,7 @@
package_dir = {'' : 'src'},

test_suite = 'tests',
install_requires = deps,

platforms = 'any',
zip_safe = False,
Expand Down
458 changes: 17 additions & 441 deletions src/romkan/common.py

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions src/romkan/compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys
import string

# True on Python 2 (or older); every alias below is chosen from this flag.
PY2 = sys.version_info < (3,)

if not PY2:
    # Python 3: the standard csv reader and the built-in text type are used.
    from csv import DictReader
    letters = string.ascii_letters
    unichr = chr
    unicode = str
else:
    # Python 2: re-export the built-ins under the same names and take the
    # CSV reader from unicodecsv.
    from unicodecsv import DictReader
    letters = string.letters
    unichr = unichr
    unicode = unicode
67 changes: 67 additions & 0 deletions src/romkan/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import re
from romkan.compat import DictReader, unichr, unicode


# Directory containing this module; mapping.csv is looked up next to it.
HERE = os.path.dirname(os.path.abspath(__file__))
MAPPING = os.path.join(HERE, 'mapping.csv')


def _len_cmp(x):
    """Sort key that places longer strings before shorter ones."""
    return -len(x)


# Marks appended to both kana lists: the combining and spacing (semi-)voiced
# sound marks (U+3099..U+309C), the double hyphen (U+30A0) and the prolonged
# sound mark (U+30FC).
common = list(map(unichr, (0x3099, 0x309A, 0x309B, 0x309C, 0x30A0, 0x30FC)))

# Note: code points that would correspond to Hiragana VA (0x3097) and VI (0x3098) are unused,
# so two-character spellings are listed in their place.
hiragana = list(map(unichr, range(0x3041, 0x3097)))
hiragana.extend(['ゔぁ', 'ゔぃ', 'ゝ', 'ゞ'])
hiragana.extend(common)

katakana = list(map(unichr, range(0x30A1, 0x30F9)))
katakana.extend(['ヽ', 'ヾ'])
katakana.extend(common)


# Position-by-position tables between the hiragana and katakana lists.
h2k_dct = dict(zip(hiragana, katakana))
k2h_dct = dict(zip(katakana, hiragana))

# Normalising tables: a kana from either list maps to the entry at the same
# position in the target list.
_either_script = hiragana + katakana
to_hira_dct = dict(zip(_either_script, hiragana + hiragana))
to_kata_dct = dict(zip(_either_script, katakana + katakana))


def get_converter(dct, strict=True):
    """
    Build a function that rewrites text according to the mapping `dct`.

    strict=True (default): the returned function requires a non-empty
    unicode string, maps it one character at a time through `dct` and
    raises UnicodeError if any character is not a key of `dct`.

    strict=False: every occurrence of a key of `dct` is replaced by its
    value (longer keys are tried first); all other text is left untouched.
    The key pattern is built once, when the converter is created.
    """
    if not strict:
        # Longest keys first so multi-character keys win over their prefixes;
        # keys are escaped so that they always match literally.
        longest_first = sorted(dct, key=len, reverse=True)
        pat = re.compile("|".join(re.escape(key) for key in longest_first))

        def convert(text):
            return pat.sub(lambda found: dct[found.group(0)], text)
        return convert

    def convert(text):
        assert isinstance(text, unicode) and len(text) > 0
        try:
            return ''.join([dct[c] for c in text])
        except KeyError:
            # The comprehension variable is not visible here on Python 3, so
            # the code point is derived from `text` itself.
            msg = hex(ord(text)) if len(text) == 1 else text
            raise UnicodeError("Invalid code point supplied: %s" % msg)
    return convert

# Strict converters: one per lookup table, each built with the default
# strict=True.
h2k = get_converter(h2k_dct)
k2h = get_converter(k2h_dct)
hira_strict = get_converter(to_hira_dct)
kata_strict = get_converter(to_kata_dct)

# Non-strict katakana converter built from the same table as kata_strict.
kata = get_converter(to_kata_dct, strict=False)

def _read_mapping_rows(path=MAPPING):
    """
    Parse the mapping CSV once and return its rows as a list of dicts.

    The file is always closed before returning. On Python 3 (where the
    compat `unicode` alias is `str`) the file is opened as UTF-8 text so the
    result does not depend on the locale's default encoding; otherwise it is
    opened in binary mode for the unicodecsv reader.
    """
    if unicode is str:
        handle = open(path, encoding='utf-8', newline='')
    else:
        handle = open(path, 'rb')
    try:
        return list(DictReader(handle))
    finally:
        handle.close()


# Every table below is derived from this single parse of the CSV.
_rows = _read_mapping_rows()

KANROM = dict((r['Katakana'], r['Kunrei']) for r in _rows)
ROMKAN = dict((r['Kunrei'], r['Katakana']) for r in _rows)

# FIXME - this replicates previous behavior but is incorrect because it mixes Hepburn and Kunrei schemes
KANROM.update((r['Katakana'], r['Hepburn']) for r in _rows)
ROMKAN.update((r['Hepburn'], r['Katakana']) for r in _rows)

# special modification
# wo -> ヲ, but ヲ/ウォ -> wo
# du -> ヅ, but ヅ/ドゥ -> du
# we -> ウェ, ウェ -> we
ROMKAN.update({"du": "ヅ", "di": "ヂ", "fu": "フ", "ti": "チ",
               "wi": "ウィ", "we": "ウェ", "wo": "ヲ"})


KUNREI = [r['Kunrei'] for r in _rows]
HEPBURN = [r['Hepburn'] for r in _rows]


if __name__ == "__main__":
    # Self-check: each CSV row's hiragana and katakana spellings must convert
    # into each other and survive the strict normalising converters.
    # MAPPING is used (rather than a bare 'mapping.csv') so the check does
    # not depend on the current working directory; the file is closed after
    # the rows have been read.
    with open(MAPPING) as handle:
        rows = list(DictReader(handle))
    for row in rows:
        h, k = row['Hiragana'], row['Katakana']
        assert k == h2k(h) and h == k2h(k)
        assert k == kata_strict(k) and h == hira_strict(h)
        assert h == hira_strict(kata_strict(h))
        assert k == kata_strict(hira_strict(k))
238 changes: 238 additions & 0 deletions src/romkan/mapping.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
Katakana,Hiragana,Kunrei,Hepburn
ァ,ぁ,xa,xa
ウ,う,u,u
ヴォ,ゔぉ,vo,vo
カ,か,ka,ka
キョ,きょ,kyo,kyo
ク,く,ku,ku
ゴ,ご,go,go
サ,さ,sa,sa
ショ,しょ,syo,sho
ジ,じ,zi,ji
ス,す,su,su
ゾ,ぞ,zo,zo
タ,た,ta,ta
チョ,ちょ,tyo,cho
ティ,てぃ,ti,ti
ッ,っ,xtu,xtsu
ッヴ,っゔ,vvu,vvu
ッヴェ,っゔぇ,vve,vve
ッカ,っか,kka,kka
ッキュ,っきゅ,kkyu,kkyu
ッギュ,っぎゅ,ggyu,ggyu
ッケ,っけ,kke,kke
ッザ,っざ,zza,zza
ッシュ,っしゅ,ssyu,sshu
ッジ,っじ,zzi,jji
ッス,っす,ssu,ssu
ッゾ,っぞ,zzo,zzo
ッチャ,っちゃ,ttya,ccha
ッヂャ,っぢゃ,ddya,ddya
ッヅ,っづ,ddu,ddu
ッドゥ,っどぅ,ddu,ddu
ッハ,っは,hha,hha
ッヒャ,っひゃ,hhya,hhya
ッビャ,っびゃ,bbya,bbya
ッピャ,っぴゃ,ppya,ppya
ッファ,っふぁ,ffa,ffa
ッブ,っぶ,bbu,bbu
ッホ,っほ,hho,hho
ッヨ,っよ,yyo,yyo
ッリュ,っりゅ,rryu,rryu
ッロ,っろ,rro,rro
ツ,つ,tu,tsu
ド,ど,do,do
ナ,な,na,na
ヌ,ぬ,nu,nu
ハ,は,ha,ha
ヒュ,ひゅ,hyu,hyu
ビョ,びょ,byo,byo
フ,ふ,hu,fu
フュ,ふゅ,fu,fu
ブ,ぶ,bu,bu
ホ,ほ,ho,ho
マ,ま,ma,ma
ム,む,mu,mu
ャ,ゃ,xya,xya
ヨ,よ,yo,yo
ラ,ら,ra,ra
ル,る,ru,ru
ヮ,ゎ,xwa,xwa
ヲ,を,wo,wo
ン,ん,n',n'
ディ,でぃ,dyi,di
ー,ー,-,-
チェ,ちぇ,tye,che
ッチェ,っちぇ,ttye,cche
ジェ,じぇ,zye,je
ア,あ,a,a
ヴ,ゔ,vu,vu
ェ,ぇ,xe,xe
ガ,が,ga,ga
ギ,ぎ,gi,gi
グ,ぐ,gu,gu
ザ,ざ,za,za
シェ,しぇ,sye,she
ジャ,じゃ,zya,ja
ズ,ず,zu,zu
ダ,だ,da,da
ヂ,ぢ,di,di
ッヴァ,っゔぁ,vva,vva
ッヴォ,っゔぉ,vvo,vvo
ッガ,っが,gga,gga
ッキョ,っきょ,kkyo,kkyo
ッギョ,っぎょ,ggyo,ggyo
ッゲ,っげ,gge,gge
ッシ,っし,ssi,sshi
ッショ,っしょ,ssyo,ssho
ッジャ,っじゃ,zzya,jja
ッズ,っず,zzu,zzu
ッタ,った,tta,tta
ッチュ,っちゅ,ttyu,cchu
ッヂュ,っぢゅ,ddyu,ddyu
ッテ,って,tte,tte
ッバ,っば,bba,bba
ッヒュ,っひゅ,hhyu,hhyu
ッビュ,っびゅ,bbyu,bbyu
ッピュ,っぴゅ,ppyu,ppyu
ッフィ,っふぃ,ffi,ffi
ップ,っぷ,ppu,ppu
ッボ,っぼ,bbo,bbo
ッラ,っら,rra,rra
ッリョ,っりょ,rryo,rryo
ヅ,づ,du,du
ドゥ,どぅ,du,du
ニ,に,ni,ni
ネ,ね,ne,ne
バ,ば,ba,ba
ヒョ,ひょ,hyo,hyo
ピ,ぴ,pi,pi
ファ,ふぁ,fa,fa
プ,ぷ,pu,pu
ボ,ぼ,bo,bo
ミ,み,mi,mi
メ,め,me,me
ヤ,や,ya,ya
リ,り,ri,ri
レ,れ,re,re
ワ,わ,wa,wa
ウォ,うぉ,wo,wo
ウェ,うぇ,we,we
ィ,ぃ,xi,xi
ヴァ,ゔぁ,va,va
エ,え,e,e
キ,き,ki,ki
ギャ,ぎゃ,gya,gya
ケ,け,ke,ke
シ,し,si,shi
ジュ,じゅ,zyu,ju
セ,せ,se,se
チ,ち,ti,chi
ヂャ,ぢゃ,dya,dya
ッヴィ,っゔぃ,vvi,vvi
ッキ,っき,kki,kki
ッギ,っぎ,ggi,ggi
ック,っく,kku,kku
ッコ,っこ,kko,kko
ッシャ,っしゃ,ssya,ssha
ッシェ,っしぇ,ssye,sshe
ッジュ,っじゅ,zzyu,jju
ッセ,っせ,sse,sse
ッダ,っだ,dda,dda
ッチョ,っちょ,ttyo,ccho
ッヂョ,っぢょ,ddyo,ddyo
ッデ,っで,dde,dde
ッパ,っぱ,ppa,ppa
ッヒョ,っひょ,hhyo,hhyo
ッビョ,っびょ,bbyo,bbyo
ッピョ,っぴょ,ppyo,ppyo
ッフェ,っふぇ,ffe,ffe
ッヘ,っへ,hhe,hhe
ッポ,っぽ,ppo,ppo
ッリ,っり,rri,rri
ッル,っる,rru,rru
テ,て,te,te
ニャ,にゃ,nya,nya
ノ,の,no,no
パ,ぱ,pa,pa
ビ,び,bi,bi
ピャ,ぴゃ,pya,pya
フィ,ふぃ,fi,fi
ヘ,へ,he,he
ポ,ぽ,po,po
ミャ,みゃ,mya,mya
モ,も,mo,mo
ュ,ゅ,xyu,xyu
リャ,りゃ,rya,rya
ロ,ろ,ro,ro
ウィ,うぃ,wi,wi
ン,ん,n,n'
イ,い,i,i
ヴィ,ゔぃ,vi,vi
ォ,ぉ,xo,xo
キャ,きゃ,kya,kya
ギュ,ぎゅ,gyu,gyu
ゲ,げ,ge,ge
シャ,しゃ,sya,sha
ジョ,じょ,zyo,jo
ゼ,ぜ,ze,ze
チャ,ちゃ,tya,cha
ヂュ,ぢゅ,dyu,dyu
ッキャ,っきゃ,kkya,kkya
ッギャ,っぎゃ,ggya,ggya
ッグ,っぐ,ggu,ggu
ッゴ,っご,ggo,ggo
ッジョ,っじょ,zzyo,jjo
ッゼ,っぜ,zze,zze
ッチ,っち,tti,cchi
ッヂ,っぢ,ddi,ddi
ッツ,っつ,ttu,ttsu
ット,っと,tto,tto
ッヒ,っひ,hhi,hhi
ッビ,っび,bbi,bbi
ッピ,っぴ,ppi,ppi
ッフ,っふ,hhu,ffu
ッフォ,っふぉ,ffo,ffo
ッベ,っべ,bbe,bbe
ッヤ,っや,yya,yya
ッリャ,っりゃ,rrya,rrya
ッレ,っれ,rre,rre
デ,で,de,de
ニュ,にゅ,nyu,nyu
ヒ,ひ,hi,hi
ビャ,びゃ,bya,bya
ピュ,ぴゅ,pyu,pyu
フェ,ふぇ,fe,fe
ベ,べ,be,be
ミュ,みゅ,myu,myu
ユ,ゆ,yu,yu
リュ,りゅ,ryu,ryu
ヰ,ゐ,wi,wi
ゥ,ぅ,xu,xu
ヴェ,ゔぇ,ve,ve
オ,お,o,o
キュ,きゅ,kyu,kyu
ギョ,ぎょ,gyo,gyo
コ,こ,ko,ko
シュ,しゅ,syu,shu
ソ,そ,so,so
チュ,ちゅ,tyu,chu
ヂョ,ぢょ,dyo,dyo
ッサ,っさ,ssa,ssa
ッソ,っそ,sso,sso
ッティ,ってぃ,tti,tti
ッド,っど,ddo,ddo
ッフュ,っふゅ,ffu,ffu
ッペ,っぺ,ppe,ppe
ッユ,っゆ,yyu,yyu
ト,と,to,to
ニョ,にょ,nyo,nyo
ヒャ,ひゃ,hya,hya
ビュ,びゅ,byu,byu
ピョ,ぴょ,pyo,pyo
フォ,ふぉ,fo,fo
ペ,ぺ,pe,pe
ミョ,みょ,myo,myo
ョ,ょ,xyo,xyo
リョ,りょ,ryo,ryo
ヱ,ゑ,we,we
44 changes: 44 additions & 0 deletions src/romkan/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re
from romkan.data import ROMKAN, katakana, hiragana
from romkan.compat import letters


def is_consonant(c):
    """
    Check whether `c` starts with a Latin letter that is a consonant in Japanese.

    The comparison is case-insensitive. A MatchObject is returned on
    success and None otherwise.
    """
    lowered = c.lower()
    consonants = re.compile("[ckgszjtdhfpbmyrwxn]")
    return consonants.match(lowered)


def is_vowel(c):
    """
    Check whether `c` starts with a Latin letter that is a vowel in Japanese.

    The comparison is case-insensitive. A MatchObject is returned on
    success and None otherwise.
    """
    lowered = c.lower()
    vowels = re.compile("[aeiou]")
    return vowels.match(lowered)


def expand_consonant(c):
    """
    Expand consonant to its related moras.
    Example: 'sh' => ['sha', 'she', 'shi', 'sho', 'shu']

    The prefix is lower-cased and matched literally; a mora is returned when
    it consists of that prefix followed by exactly one more character.
    """
    # Escape the prefix so regex metacharacters in the argument cannot alter
    # the pattern, and compile it once instead of once per ROMKAN key.
    pattern = re.compile("^%s.$" % re.escape(c.lower()))
    return sorted(mora for mora in ROMKAN if pattern.match(mora))


def is_romaji(text):
    """
    Return True when every character of `text` is found in `letters`.

    An empty `text` yields True.
    """
    membership = [ch in letters for ch in text]
    return all(membership)


def is_katakana(text):
    """
    Return True when every character of `text` is an entry of `katakana`.

    An empty `text` yields True.
    """
    for ch in text:
        if ch not in katakana:
            return False
    return True


def is_hiragana(text):
    """
    Return True when every character of `text` is an entry of `hiragana`.

    An empty `text` yields True.
    """
    for ch in text:
        if ch not in hiragana:
            return False
    return True


def is_kana(text):
    """
    Return True when every character of `text` is an entry of `katakana`
    or of `hiragana`.

    An empty `text` yields True.
    """
    for ch in text:
        if ch not in katakana and ch not in hiragana:
            return False
    return True
Loading