|
from __future__ import annotations |
|
|
|
from fontTools.misc.textTools import byteord, tostr |
|
|
|
import re |
|
from bisect import bisect_right |
|
from typing import Literal, TypeVar, overload |
|
|
|
|
|
try: |
|
|
|
|
|
from unicodedata2 import * |
|
except ImportError: |
|
|
|
from unicodedata import * |
|
|
|
from . import Blocks, Scripts, ScriptExtensions, OTTags |
|
|
|
|
|
__all__ = [
    # Names provided by the `unicodedata2` / `unicodedata` star import at the
    # top of this module and re-exported unchanged.
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # Additional functions defined in this module.
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
]
|
|
|
|
|
def script(char):
    """Look up the Unicode Script property of the character 'char'.

    The result is the four-letter script code, returned as a string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # bisect_right() returns the insertion point to the right of any entry
    # equal to 'codepoint', so the entry just before it is the last one that
    # is <= codepoint. Scripts.RANGES is assumed to be a sorted list of range
    # starting points (bisect requires sorted input) that runs parallel to
    # Scripts.VALUES, hence the "- 1" to select the containing range.
    range_index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[range_index]
|
|
|
|
|
def script_extension(char):
    """Look up the Script_Extensions property of the character 'char'.

    The result is a set of four-letter script codes (strings).

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # Same "last range start <= codepoint" lookup as used by script().
    range_index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[range_index]
    if extensions is not None:
        return extensions
    # A None entry means the table has no explicit extensions for this range;
    # fall back to a one-element set holding the character's Script value.
    return {script(char)}
|
|
|
|
|
def script_name(code, default=KeyError):
    """Translate a four-letter Unicode script code into its long name.

    Underscores in the stored name are replaced with spaces to make the
    result human-readable.

    When 'code' is unknown and 'default' is KeyError (or a subclass of it),
    the KeyError from the lookup is propagated. Any other 'default' (e.g.
    'Unknown' or None) is returned as a fallback value instead of raising.
    """
    try:
        raw_name = Scripts.NAMES[code]
    except KeyError:
        # An exception class as 'default' acts as the "please raise" sentinel.
        should_raise = isinstance(default, type) and issubclass(default, KeyError)
        if not should_raise:
            return default
        raise
    return str(raw_name.replace("_", " "))
|
|
|
|
|
# Matches any run of hyphens, underscores and spaces.
_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Return 'string' lower-cased with all spaces, '-' and '_' removed.

    Used to compare property names loosely.
    """
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()
|
|
|
|
|
# Reverse lookup table: loosely-normalized long script name -> script code,
# built from the keys and values of Scripts.NAMES.
_SCRIPT_CODES = {
    _normalize_property_name(long_name): short_code
    for short_code, long_name in Scripts.NAMES.items()
}
|
|
|
|
|
def script_code(script_name, default=KeyError):
    """Translate a long script name into its four-letter Unicode script code.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    When no script matches and 'default' is KeyError (or a subclass of it),
    the KeyError from the lookup is propagated. Any other 'default' (e.g.
    'Zzzz' or None) is returned as a fallback value instead of raising.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        found = _SCRIPT_CODES[lookup_key]
    except KeyError:
        # An exception class as 'default' acts as the "please raise" sentinel.
        should_raise = isinstance(default, type) and issubclass(default, KeyError)
        if not should_raise:
            return default
        raise
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
# Four-letter script codes that script_horizontal_direction() reports as
# "RTL"; any other known script code is reported as "LTR".
# The trailing comments give each script's usual English name.
RTL_SCRIPTS = {
    "Arab",  # Arabic
    "Hebr",  # Hebrew
    "Syrc",  # Syriac
    "Thaa",  # Thaana
    "Cprt",  # Cypriot
    "Khar",  # Kharoshthi
    "Phnx",  # Phoenician
    "Nkoo",  # Nko
    "Lydi",  # Lydian
    "Avst",  # Avestan
    "Armi",  # Imperial Aramaic
    "Phli",  # Inscriptional Pahlavi
    "Prti",  # Inscriptional Parthian
    "Sarb",  # Old South Arabian
    "Orkh",  # Old Turkic
    "Samr",  # Samaritan
    "Mand",  # Mandaic
    "Merc",  # Meroitic Cursive
    "Mero",  # Meroitic Hieroglyphs
    "Mani",  # Manichaean
    "Mend",  # Mende Kikakui
    "Nbat",  # Nabataean
    "Narb",  # Old North Arabian
    "Palm",  # Palmyrene
    "Phlp",  # Psalter Pahlavi
    "Hatr",  # Hatran
    "Hung",  # Old Hungarian
    "Adlm",  # Adlam
    "Rohg",  # Hanifi Rohingya
    "Sogo",  # Old Sogdian
    "Sogd",  # Sogdian
    "Elym",  # Elymaic
    "Chrs",  # Chorasmian
    "Yezi",  # Yezidi
    "Ougr",  # Old Uyghur
}
|
|
|
|
|
# The only two direction strings script_horizontal_direction() can produce.
HorizDirection = Literal["RTL", "LTR"]
# Type of a caller-supplied fallback value.
T = TypeVar("T")


@overload
def script_horizontal_direction(script_code: str, default: T) -> HorizDirection | T: ...


@overload
def script_horizontal_direction(
    script_code: str, default: type[KeyError] = KeyError
) -> HorizDirection: ...


def script_horizontal_direction(
    script_code: str, default: T | type[KeyError] = KeyError
) -> HorizDirection | T:
    """Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property. Otherwise return "LTR".

    A script code that is not a key of Scripts.NAMES is treated as unknown:
    if 'default' is KeyError (or a subclass of it), that exception class is
    raised with the script code as its argument; any other 'default' value
    is returned as-is.
    """
    if script_code in Scripts.NAMES:
        if script_code in RTL_SCRIPTS:
            return "RTL"
        return "LTR"

    # Unknown script: an exception class as 'default' means "raise it".
    raise_on_unknown = isinstance(default, type) and issubclass(default, KeyError)
    if raise_on_unknown:
        raise default(script_code)
    return default
|
|
|
|
|
def block(char):
    """Look up the Unicode Block property of the character 'char'.

    The result is the block name, returned as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # bisect_right() minus one selects the last entry of Blocks.RANGES that is
    # <= codepoint; the value at the same index in Blocks.VALUES is returned.
    range_index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[range_index]
|
|
|
|
|
def ot_tags_from_script(script_code):
    """Return the list of OpenType script tags for a Unicode script code.

    Script codes listed in OTTags.SCRIPT_EXCEPTIONS map to exactly one tag.
    Invalid/unknown script codes yield a list holding only the default
    script tag (OTTags.DEFAULT_SCRIPT). Otherwise the tag is the script code
    with its first letter lower-cased; when OTTags.NEW_SCRIPT_TAGS has
    entries for the script, those are included too and the combined list is
    returned in reverse order, so the algorithmic tag comes last.
    """
    exceptions = OTTags.SCRIPT_EXCEPTIONS
    if script_code in exceptions:
        return [exceptions[script_code]]

    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    algorithmic_tag = script_code[0].lower() + script_code[1:]
    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [algorithmic_tag]

    # Append the newer tags after the algorithmic one, then flip the list so
    # that the last tag added comes out first.
    newer_tags = OTTags.NEW_SCRIPT_TAGS[script_code]
    return [algorithmic_tag, *newer_tags][::-1]
|
|
|
|
|
def ot_tag_to_script(tag):
    """Return the Unicode script code for the given OpenType script tag.

    Returns None for the default script tag (OTTags.DEFAULT_SCRIPT, "DFLT")
    or if there is no Unicode script associated with the tag.

    Raises ValueError if the tag is invalid, i.e. if after stripping
    surrounding whitespace it is empty, contains a space, or is longer than
    four characters.
    """
    tag = tostr(tag).strip()
    if not tag or " " in tag or len(tag) > 4:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # Aliases are looked up on the stripped (unpadded) tag.
    if tag in OTTags.SCRIPT_ALIASES:
        tag = OTTags.SCRIPT_ALIASES[tag]

    # Pad with trailing spaces up to the four-character tag width.
    # str.ljust() leaves longer strings untouched and always terminates,
    # whereas a `while len(tag) != 4` loop would never finish if an alias
    # ever resolved to more than four characters.
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # No single Unicode script corresponds to the default script tag.
        return None

    # Explicit table entries take precedence over the algorithmic mapping.
    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    if tag in OTTags.SCRIPT_EXCEPTIONS_REVERSED:
        return OTTags.SCRIPT_EXCEPTIONS_REVERSED[tag]

    # The remaining conversion is purely algorithmic: upper-case the first
    # letter, and replace a space in the third or fourth position by
    # repeating the previous letter of the result (e.g. 'nko ' -> 'Nkoo').
    script_code = tag[0].upper() + tag[1]
    for i in range(2, 4):
        script_code += script_code[i - 1] if tag[i] == " " else tag[i]

    if script_code not in Scripts.NAMES:
        return None
    return script_code
|
|