|
|
|
from dataclasses import dataclass |
|
from fugashi import Tagger |
|
from num2kana import Convert |
|
import mojimoji |
|
import re |
|
import unicodedata |
|
|
|
# Katakana -> broad-IPA transcription table.
# Keys are generated with chr() over the katakana Unicode block:
# chr(12449) is ァ (U+30A1) through chr(12534) ヶ (U+30F6).
# Two code points are deliberately absent: the sokuon ッ (12483) and the
# moraic nasal ン (12531) — both are context-dependent and are resolved in
# Katsu._get_single_mapping instead of by table lookup.
HEPBURN = {
    # Vowels (small and full forms map to the same IPA vowel).
    chr(12449):'a',    # ァ
    chr(12450):'a',    # ア
    chr(12451):'i',    # ィ
    chr(12452):'i',    # イ
    chr(12453):'ɯ',    # ゥ
    chr(12454):'ɯ',    # ウ
    chr(12455):'e',    # ェ
    chr(12456):'e',    # エ
    chr(12457):'o',    # ォ
    chr(12458):'o',    # オ
    # k/g rows.
    chr(12459):'ka',   # カ
    chr(12460):'ɡa',   # ガ
    chr(12461):'ki',   # キ
    chr(12462):'ɡi',   # ギ
    chr(12463):'kɯ',   # ク
    chr(12464):'ɡɯ',   # グ
    chr(12465):'ke',   # ケ
    chr(12466):'ɡe',   # ゲ
    chr(12467):'ko',   # コ
    chr(12468):'ɡo',   # ゴ
    # s/z rows.
    chr(12469):'sa',   # サ
    chr(12470):'za',   # ザ
    chr(12471):'ɕi',   # シ
    chr(12472):'dʑi',  # ジ
    chr(12473):'sɨ',   # ス
    chr(12474):'zɨ',   # ズ
    chr(12475):'se',   # セ
    chr(12476):'ze',   # ゼ
    chr(12477):'so',   # ソ
    chr(12478):'zo',   # ゾ
    # t/d rows.
    chr(12479):'ta',   # タ
    chr(12480):'da',   # ダ
    chr(12481):'tɕi',  # チ
    chr(12482):'dʑi',  # ヂ
    # 12483 ッ (sokuon) intentionally skipped — handled contextually.
    chr(12484):'tsɨ',  # ツ
    chr(12485):'zɨ',   # ヅ
    chr(12486):'te',   # テ
    chr(12487):'de',   # デ
    chr(12488):'to',   # ト
    chr(12489):'do',   # ド
    # n row.
    chr(12490):'na',   # ナ
    chr(12491):'ɲi',   # ニ
    chr(12492):'nɯ',   # ヌ
    chr(12493):'ne',   # ネ
    chr(12494):'no',   # ノ
    # h/b/p rows.
    chr(12495):'ha',   # ハ
    chr(12496):'ba',   # バ
    chr(12497):'pa',   # パ
    chr(12498):'çi',   # ヒ
    chr(12499):'bi',   # ビ
    chr(12500):'pi',   # ピ
    chr(12501):'ɸɯ',   # フ
    chr(12502):'bɯ',   # ブ
    chr(12503):'pɯ',   # プ
    chr(12504):'he',   # ヘ
    chr(12505):'be',   # ベ
    chr(12506):'pe',   # ペ
    chr(12507):'ho',   # ホ
    chr(12508):'bo',   # ボ
    chr(12509):'po',   # ポ
    # m row.
    chr(12510):'ma',   # マ
    chr(12511):'mi',   # ミ
    chr(12512):'mɯ',   # ム
    chr(12513):'me',   # メ
    chr(12514):'mo',   # モ
    # y row (small and full forms).
    chr(12515):'ja',   # ャ
    chr(12516):'ja',   # ヤ
    chr(12517):'jɯ',   # ュ
    chr(12518):'jɯ',   # ユ
    chr(12519):'jo',   # ョ
    chr(12520):'jo',   # ヨ
    # r row.
    chr(12521):'ra',   # ラ
    chr(12522):'ri',   # リ
    chr(12523):'rɯ',   # ル
    chr(12524):'re',   # レ
    chr(12525):'ro',   # ロ
    # w row and archaic kana.
    chr(12526):'wa',   # ヮ
    chr(12527):'wa',   # ワ
    chr(12528):'i',    # ヰ (archaic wi)
    chr(12529):'e',    # ヱ (archaic we)
    chr(12530):'o',    # ヲ
    # 12531 ン (moraic nasal) intentionally skipped — handled contextually.
    chr(12532):'vɯ',   # ヴ
    chr(12533):'ka',   # ヵ
    chr(12534):'ke',   # ヶ
}
# Sanity check: the full ァ..ヶ range is covered except ッ and ン.
assert len(HEPBURN) == 84 and all(i in {12483, 12531} or chr(i) in HEPBURN for i in range(12449, 12535))
|
|
|
# The hiragana and katakana Unicode blocks are parallel, offset by 0x60 (96),
# so every hiragana reading can be derived by shifting each katakana key's
# code point down by 96 (e.g. カ U+30AB -> か U+304B).  Iterate over a
# snapshot (list) because the dict is mutated during the loop.
for k, v in list(HEPBURN.items()):
    HEPBURN[chr(ord(k)-96)] = v
assert len(HEPBURN) == 84*2
|
|
|
# ヷ ヸ ヹ ヺ (U+30F7..U+30FA): katakana-only v-kana with no hiragana
# counterparts, so they are added after the hiragana derivation above.
HEPBURN.update({
    chr(12535):'va',  # ヷ
    chr(12536):'vi',  # ヸ
    chr(12537):'ve',  # ヹ
    chr(12538):'vo',  # ヺ
})
assert len(HEPBURN) == 84*2+4 and all(chr(i) in HEPBURN for i in range(12535, 12539))
|
|
|
# Katakana phonetic extensions (U+31F0..U+31FF): the small kana used mainly
# for transcribing Ainu; each is read like its full-size counterpart here.
HEPBURN.update({
    chr(12784):'kɯ',  # ㇰ
    chr(12785):'ɕi',  # ㇱ
    chr(12786):'sɨ',  # ㇲ
    chr(12787):'to',  # ㇳ
    chr(12788):'nɯ',  # ㇴ
    chr(12789):'ha',  # ㇵ
    chr(12790):'çi',  # ㇶ
    chr(12791):'ɸɯ',  # ㇷ
    chr(12792):'he',  # ㇸ
    chr(12793):'ho',  # ㇹ
    chr(12794):'mɯ',  # ㇺ
    chr(12795):'ra',  # ㇻ
    chr(12796):'ri',  # ㇼ
    chr(12797):'rɯ',  # ㇽ
    chr(12798):'re',  # ㇾ
    chr(12799):'ro',  # ㇿ
})
assert len(HEPBURN) == 84*2+4+16 and all(chr(i) in HEPBURN for i in range(12784, 12800))
|
|
|
# Two-kana digraphs (yōon and extended combinations): a full-size kana
# followed by a small vowel/y-kana forms a single mora, so these two-char
# keys take precedence over single-kana lookups in _get_single_mapping.
HEPBURN.update({
    chr(12452)+chr(12455):'je',   # イェ
    chr(12454)+chr(12451):'wi',   # ウィ
    chr(12454)+chr(12455):'we',   # ウェ
    chr(12454)+chr(12457):'wo',   # ウォ
    chr(12461)+chr(12455):'kʲe',  # キェ
    chr(12461)+chr(12515):'kʲa',  # キャ
    chr(12461)+chr(12517):'kʲɨ',  # キュ
    chr(12461)+chr(12519):'kʲo',  # キョ
    chr(12462)+chr(12515):'ɡʲa',  # ギャ
    chr(12462)+chr(12517):'ɡʲɨ',  # ギュ
    chr(12462)+chr(12519):'ɡʲo',  # ギョ
    chr(12463)+chr(12449):'kʷa',  # クァ
    chr(12463)+chr(12451):'kʷi',  # クィ
    chr(12463)+chr(12455):'kʷe',  # クェ
    chr(12463)+chr(12457):'kʷo',  # クォ
    chr(12464)+chr(12449):'ɡʷa',  # グァ
    chr(12464)+chr(12451):'ɡʷi',  # グィ
    chr(12464)+chr(12455):'ɡʷe',  # グェ
    chr(12464)+chr(12457):'ɡʷo',  # グォ
    chr(12471)+chr(12455):'ɕe',   # シェ
    chr(12471)+chr(12515):'ɕa',   # シャ
    chr(12471)+chr(12517):'ɕɨ',   # シュ
    chr(12471)+chr(12519):'ɕo',   # ショ
    chr(12472)+chr(12455):'dʑe',  # ジェ
    chr(12472)+chr(12515):'dʑa',  # ジャ
    chr(12472)+chr(12517):'dʑɨ',  # ジュ
    chr(12472)+chr(12519):'dʑo',  # ジョ
    chr(12481)+chr(12455):'tɕe',  # チェ
    chr(12481)+chr(12515):'tɕa',  # チャ
    chr(12481)+chr(12517):'tɕɨ',  # チュ
    chr(12481)+chr(12519):'tɕo',  # チョ
    chr(12482)+chr(12515):'dʑa',  # ヂャ
    chr(12482)+chr(12517):'dʑɨ',  # ヂュ
    chr(12482)+chr(12519):'dʑo',  # ヂョ
    chr(12484)+chr(12449):'tsa',  # ツァ
    chr(12484)+chr(12451):'tsi',  # ツィ
    chr(12484)+chr(12455):'tse',  # ツェ
    chr(12484)+chr(12457):'tso',  # ツォ
    chr(12486)+chr(12451):'ti',   # ティ
    chr(12486)+chr(12517):'tʲɨ',  # テュ
    chr(12487)+chr(12451):'di',   # ディ
    chr(12487)+chr(12517):'dʲɨ',  # デュ
    chr(12488)+chr(12453):'tɯ',   # トゥ
    chr(12489)+chr(12453):'dɯ',   # ドゥ
    chr(12491)+chr(12455):'ɲe',   # ニェ
    chr(12491)+chr(12515):'ɲa',   # ニャ
    chr(12491)+chr(12517):'ɲɨ',   # ニュ
    chr(12491)+chr(12519):'ɲo',   # ニョ
    chr(12498)+chr(12455):'çe',   # ヒェ
    chr(12498)+chr(12515):'ça',   # ヒャ
    chr(12498)+chr(12517):'çɨ',   # ヒュ
    chr(12498)+chr(12519):'ço',   # ヒョ
    chr(12499)+chr(12515):'bʲa',  # ビャ
    chr(12499)+chr(12517):'bʲɨ',  # ビュ
    chr(12499)+chr(12519):'bʲo',  # ビョ
    chr(12500)+chr(12515):'pʲa',  # ピャ
    chr(12500)+chr(12517):'pʲɨ',  # ピュ
    chr(12500)+chr(12519):'pʲo',  # ピョ
    chr(12501)+chr(12449):'ɸa',   # ファ
    chr(12501)+chr(12451):'ɸi',   # フィ
    chr(12501)+chr(12455):'ɸe',   # フェ
    chr(12501)+chr(12457):'ɸo',   # フォ
    chr(12501)+chr(12517):'ɸʲɨ',  # フュ
    chr(12501)+chr(12519):'ɸʲo',  # フョ
    chr(12511)+chr(12515):'mʲa',  # ミャ
    chr(12511)+chr(12517):'mʲɨ',  # ミュ
    chr(12511)+chr(12519):'mʲo',  # ミョ
    chr(12522)+chr(12515):'rʲa',  # リャ
    chr(12522)+chr(12517):'rʲɨ',  # リュ
    chr(12522)+chr(12519):'rʲo',  # リョ
    chr(12532)+chr(12449):'va',   # ヴァ
    chr(12532)+chr(12451):'vi',   # ヴィ
    chr(12532)+chr(12455):'ve',   # ヴェ
    chr(12532)+chr(12457):'vo',   # ヴォ
    chr(12532)+chr(12517):'vʲɨ',  # ヴュ
    chr(12532)+chr(12519):'vʲo',  # ヴョ
})
assert len(HEPBURN) == 84*2+4+16+76

# Derive the hiragana digraphs from the katakana ones, shifting both members
# of each two-kana key down by 96 (the katakana->hiragana block offset).
# The asserts confirm each component already exists as a single-kana key.
for k, v in list(HEPBURN.items()):
    if len(k) != 2:
        continue
    a, b = k
    assert a in HEPBURN and b in HEPBURN, (a, b)
    a = chr(ord(a)-96)
    b = chr(ord(b)-96)
    assert a in HEPBURN and b in HEPBURN, (a, b)
    HEPBURN[a+b] = v
assert len(HEPBURN) == 84*2+4+16+76*2
|
|
|
# Japanese (mostly full-width) punctuation -> ASCII/typographic equivalents.
HEPBURN.update({
    '。': '.',
    '、': ',',
    '?': '?',    # full-width question mark
    '!': '!',    # full-width exclamation mark
    '「': '"',
    '」': '"',
    '『': '"',
    '』': '"',
    ':': ':',    # full-width colon
    ';': ';',    # full-width semicolon
    '(': '(',    # full-width parentheses
    ')': ')',
    '《': '(',
    '》': ')',
    '【': '[',
    '】': ']',
    '・': ' ',    # middle dot acts as a separator
    ',': ',',    # full-width comma
    '~': '—',    # wave dashes rendered as em dash
    '〜': '—',
    '—': '—',
    '«': '«',
    '»': '»',
    # Stray combining handakuten/dakuten marks are dropped entirely.
    '゚': '',
    '゙': '',
})
|
|
|
def add_dakuten(kk):
    """Given a kana (single-character string), add a dakuten."""
    plain = 'カキクケコサシスセソタチツテトハヒフヘホ'
    voiced = 'ガギグゲゴザジズゼゾダヂヅデドバビブベボ'
    # find() mirrors index()-with-except semantics without the try/except.
    pos = plain.find(kk)
    if pos == -1:
        # Not a kana that can take a dakuten.
        return None
    return voiced[pos]
|
|
|
# Small kana that modify the preceding kana rather than standing alone.
SUTEGANA = 'ャュョァィゥェォ'
# ASCII punctuation characters that may pass through to the romaji output.
PUNCT = '\'".!?(),;:-'
# Iteration (odori-ji) marks: 々 〃 ゝ ゞ ヽ ヾ.
# Fix: the final character was a duplicated ゞ (U+309E, hiragana voiced
# iteration mark); it must be ヾ (U+30FE, the katakana voiced iteration
# mark), which the 'ゞヾ' branch in _get_single_mapping expects — otherwise
# ヾ falls through the ODORI check and is silently dropped.
ODORI = '々〃ゝゞヽヾ'
|
|
|
@dataclass
class Token:
    """A single output token: romanized surface text plus a flag saying
    whether a space should follow it when tokens are joined."""
    surface: str
    space: bool

    def __str__(self):
        # Render the token, appending one trailing space when requested.
        if self.space:
            return self.surface + " "
        return self.surface
|
|
|
class Katsu:
    def __init__(self):
        """Create a Katsu object, which holds configuration as well as
        tokenizer state.

        Typical usage:

        ```python
        katsu = Katsu()
        roma = katsu.romaji("カツカレーを食べた")
        # "Cutlet curry wo tabeta"
        ```
        """
        # fugashi (MeCab) morphological tokenizer.
        self.tagger = Tagger()
        # Per-instance copy of the module table so it can be customized
        # without affecting other instances.
        self.table = dict(HEPBURN)
        # surface -> romaji overrides, consulted before normal conversion.
        self.exceptions = {}

    def romaji(self, text):
        """Build a complete string from input text."""
        if not text:
            return ''
        text = self._normalize_text(text)
        words = self.tagger(text)
        tokens = self._romaji_tokens(words)
        out = ''.join([str(tok) for tok in tokens])
        # Collapse any whitespace runs introduced by token spacing.
        return re.sub(r'\s+', ' ', out.strip())

    def phonemize(self, texts):
        # Batch convenience wrapper around romaji().
        return [self.romaji(text) for text in texts]

    def _normalize_text(self, text):
        """Given text, normalize variations in Japanese.

        This specifically removes variations that are meaningless for romaji
        conversion using the following steps:

        - Unicode NFKC normalization
        - Full-width Latin to half-width
        - Half-width katakana to full-width
        """
        # A wave dash directly before a digit marks a range, so spell it as
        # から before NFKC normalization folds 〜 into something else.
        text = re.sub(r'[〜~](?=\d)', 'から', text)
        text = unicodedata.normalize('NFKC', text)
        # Full-width Latin/digits -> half-width; kana left untouched.
        text = mojimoji.zen_to_han(text, kana=False)
        # Half-width katakana -> full-width; digits/ASCII left untouched.
        text = mojimoji.han_to_zen(text, digit=False, ascii=False)
        # Spell out each run of digits as kana via num2kana's Convert,
        # prefixing a space so the number reads as its own word.
        return ''.join([(' '+Convert(t)) if t.isdigit() else t for t in re.findall(r'\d+|\D+', text)])

    def _romaji_tokens(self, words):
        """Build a list of tokens from input nodes."""
        out = []
        for wi, word in enumerate(words):
            po = out[-1] if out else None  # previously emitted token
            # NOTE(review): pw/nw are computed but currently unused.
            pw = words[wi - 1] if wi > 0 else None
            nw = words[wi + 1] if wi < len(words) - 1 else None
            roma = self._romaji_word(word)
            tok = Token(roma, False)

            # Spacing rules around quotes, brackets and punctuation.
            surface = word.surface
            if surface in '「『' or roma in '([':
                # Opening quote/bracket: space before it, none after it.
                if po:
                    po.space = True
            elif surface in '」』' or roma in ']).,?!:':
                # Closing quote/bracket or clause punctuation: attach to the
                # preceding token, then put a space after.
                if po:
                    po.space = False
                tok.space = True
            elif roma == ' ':
                # Token that is itself a space: no extra trailing space.
                tok.space = False
            else:
                tok.space = True
            out.append(tok)

        # Strip any unresolved sokuon ッ (chr(12483)) left behind by
        # _get_single_mapping when no geminable consonant followed it.
        for tok in out:
            tok.surface = tok.surface.replace(chr(12483), '')
        return out

    def _romaji_word(self, word):
        """Return the romaji for a single word (node)."""
        surface = word.surface
        if surface in self.exceptions:
            return self.exceptions[surface]
        # Digits should have been expanded to kana by _normalize_text.
        assert not surface.isdigit(), surface
        if surface.isascii():
            # Pure ASCII passes through unchanged.
            return surface
        # Prefer the pronunciation field, falling back to kana, then surface.
        kana = word.feature.pron or word.feature.kana or surface
        if word.is_unk:
            # char_type values come from the tagger's dictionary;
            # presumably 7 = katakana and 3 = symbol — TODO confirm
            # against the fugashi/UniDic char.def in use.
            if word.char_type == 7:
                pass
            elif word.char_type == 3:
                # Symbols: map each char through the table, keeping unknowns.
                return ''.join(map(lambda c: self.table.get(c, c), surface))
            else:
                # Unknown non-kana, non-symbol word: drop it.
                return ''
        # Convert kana one character at a time with one char of lookahead
        # and lookbehind for digraphs and context-dependent kana.
        out = ''
        for ki, char in enumerate(kana):
            nk = kana[ki + 1] if ki < len(kana) - 1 else None
            pk = kana[ki - 1] if ki > 0 else None
            out += self._get_single_mapping(pk, char, nk)
        return out

    def _get_single_mapping(self, pk, kk, nk):
        """Given a single kana and its neighbors, return the mapped romaji."""
        # Iteration (odori-ji) marks repeat the previous kana.
        if kk in ODORI:
            if kk in 'ゝヽ':
                # Plain iteration mark: repeat the previous kana.
                # NOTE(review): this returns the raw kana pk, not
                # self.table[pk] — confirm that is intended.
                if pk: return pk
                else: return ''
            if kk in 'ゞヾ':
                # Voiced iteration mark: repeat the previous kana with a
                # dakuten added (e.g. か -> が reading).
                if not pk: return ''
                vv = add_dakuten(pk)
                if vv: return self.table[vv]
                else: return ''
            # Other marks (々 〃) are dropped.
            return ''

        # A two-kana digraph ending at kk (e.g. キャ) wins over singles.
        if pk and (pk + kk) in self.table:
            return self.table[pk + kk]
        if nk and (kk + nk) in self.table:
            # kk starts a digraph; it will be emitted on the next step.
            return ''
        if nk and nk in SUTEGANA:
            # Untabled kana + small-kana pair: drop the base vowel and
            # splice on the small kana's value (e.g. consonant + glide).
            if kk == 'ッ': return ''
            return self.table[kk][:-1] + self.table[nk]
        if kk in SUTEGANA:
            # A small kana already consumed by the previous step (or stray).
            return ''
        if kk == 'ー':
            # Long-vowel mark -> IPA length mark.
            return 'ː'
        if ord(kk) in {12387, 12483}:
            # Sokuon っ/ッ: geminate the following consonant if it is one
            # of the geminable onsets; otherwise emit the kana itself
            # (leftovers are stripped in _romaji_tokens).
            tnk = self.table.get(nk)
            if tnk and tnk[0] in 'bdɸɡhçijkmnɲoprstɯvwz':
                return tnk[0]
            return kk
        if ord(kk) in {12435, 12531}:
            # Moraic nasal ん/ン assimilates to the following sound:
            #   bilabial (m/p/b)        -> m
            #   velar (k/ɡ)             -> ŋ
            #   palatal (ɲ/tɕ/dʑ)       -> ɲ
            #   alveolar (n/t/d/r/z)    -> n
            #   elsewhere (final, vowels, approximants) -> uvular ɴ
            tnk = self.table.get(nk)
            if tnk:
                if tnk[0] in 'mpb':
                    return 'm'
                elif tnk[0] in 'kɡ':
                    return 'ŋ'
                elif any(tnk.startswith(p) for p in ('ɲ','tɕ','dʑ')):
                    return 'ɲ'
                elif tnk[0] in 'ntdrz':
                    return 'n'
            return 'ɴ'
        # Plain single-kana lookup; unknown characters vanish.
        return self.table.get(kk, '')
|
|