Spaces:

geored
/

gtmio

Paused

App Files Files Community

gtmio / gtm /lib /python3.12 /site-packages /pyparsing /unicode.py

geored

Upload folder using huggingface_hub

fe41391 verified over 1 year ago

raw

history blame

10.5 kB

	# unicode.py

	import sys
	from itertools import filterfalse
	from typing import List, Tuple, Union


	class _lazyclassproperty:
	def __init__(self, fn):
	self.fn = fn
	self.__doc__ = fn.__doc__
	self.__name__ = fn.__name__

	def __get__(self, obj, cls):
	if cls is None:
	cls = type(obj)
	if not hasattr(cls, "_intern") or any(
	cls._intern is getattr(superclass, "_intern", [])
	for superclass in cls.__mro__[1:]
	):
	cls._intern = {}
	attrname = self.fn.__name__
	if attrname not in cls._intern:
	cls._intern[attrname] = self.fn(cls)
	return cls._intern[attrname]


	# Type of a ``_ranges`` class attribute: a list of code point ranges, each
	# written as a 2-tuple or as a 1-tuple.
	UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]


	class unicode_set:
	    """
	    A set of Unicode characters, for language-specific strings for
	    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
	    A unicode_set is defined by a list of ranges in the Unicode character
	    set, in a class attribute ``_ranges``. Ranges can be specified using
	    2-tuples or a 1-tuple, such as::

	        _ranges = [
	            (0x0020, 0x007e),
	            (0x00a0, 0x00ff),
	            (0x0100,),
	        ]

	    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

	    A unicode set can also be defined using multiple inheritance of other unicode sets::

	        class CJK(Chinese, Japanese, Korean):
	            pass
	    """

	    _ranges: UnicodeRangeList = []

	    @_lazyclassproperty
	    def _chars_for_ranges(cls):
	        """sorted list of the distinct characters covered by the ``_ranges`` of this class and its bases"""
	        ret = []
	        for cc in cls.__mro__:
	            # unicode_set itself, and anything after it in the MRO, is not scanned
	            if cc is unicode_set:
	                break
	            for rr in getattr(cc, "_ranges", ()):
	                # rr[-1] is rr[0] for a 1-tuple, so (x,) expands to just x
	                ret.extend(range(rr[0], rr[-1] + 1))
	        # de-duplicate overlapping ranges and order by code point
	        return [chr(c) for c in sorted(set(ret))]

	    @_lazyclassproperty
	    def printables(cls):
	        """all non-whitespace characters in this range"""
	        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

	    @_lazyclassproperty
	    def alphas(cls):
	        """all alphabetic characters in this range"""
	        return "".join(filter(str.isalpha, cls._chars_for_ranges))

	    @_lazyclassproperty
	    def nums(cls):
	        """all numeric digit characters in this range"""
	        return "".join(filter(str.isdigit, cls._chars_for_ranges))

	    @_lazyclassproperty
	    def alphanums(cls):
	        """all alphanumeric characters in this range"""
	        return cls.alphas + cls.nums

	    @_lazyclassproperty
	    def identchars(cls):
	        """
	        all characters in this range that are valid identifier characters, plus underscore '_'

	        The letters listed literally below are always included as well,
	        whether or not they fall inside this set's ranges.
	        """
	        return "".join(
	            sorted(
	                set(
	                    "".join(filter(str.isidentifier, cls._chars_for_ranges))
	                    + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
	                    + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
	                    + "_"
	                )
	            )
	        )

	    @_lazyclassproperty
	    def identbodychars(cls):
	        """
	        all characters in this range that are valid identifier body characters,
	        plus the digits 0-9, and · (Unicode MIDDLE DOT)
	        """
	        # prefixing "_" tests each character in a non-leading identifier position
	        identifier_chars = {
	            c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
	        }
	        # set union (the scraped source had this operator escaped as "\|")
	        return "".join(sorted(identifier_chars | set(cls.identchars + "0123456789·")))

	    @_lazyclassproperty
	    def identifier(cls):
	        """
	        a pyparsing Word expression for an identifier using this range's definitions for
	        identchars and identbodychars
	        """
	        # imported here rather than at module top level
	        from pyparsing import Word

	        return Word(cls.identchars, cls.identbodychars)


	class pyparsing_unicode(unicode_set):
	    """
	    A namespace class for defining common language unicode_sets.

	    Each nested class is a ``unicode_set`` for one script or block. The
	    namespace's own ``_ranges`` runs from 0x0020 up to ``sys.maxunicode``.
	    """

	    # fmt: off

	    # define ranges in language character sets
	    _ranges: UnicodeRangeList = [
	        (0x0020, sys.maxunicode),
	    ]

	    class BasicMultilingualPlane(unicode_set):
	        """Unicode set for the Basic Multilingual Plane"""
	        _ranges: UnicodeRangeList = [
	            (0x0020, 0xFFFF),
	        ]

	    class Latin1(unicode_set):
	        """Unicode set for Latin-1 Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0020, 0x007E),
	            (0x00A0, 0x00FF),
	        ]

	    class LatinA(unicode_set):
	        """Unicode set for Latin-A Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0100, 0x017F),
	        ]

	    class LatinB(unicode_set):
	        """Unicode set for Latin-B Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0180, 0x024F),
	        ]

	    class Greek(unicode_set):
	        """Unicode set for Greek Unicode Character Ranges"""
	        _ranges: UnicodeRangeList = [
	            (0x0342, 0x0345),
	            (0x0370, 0x0377),
	            (0x037A, 0x037F),
	            (0x0384, 0x038A),
	            (0x038C,),
	            (0x038E, 0x03A1),
	            (0x03A3, 0x03E1),
	            (0x03F0, 0x03FF),
	            (0x1D26, 0x1D2A),
	            (0x1D5E,),
	            (0x1D60,),
	            (0x1D66, 0x1D6A),
	            (0x1F00, 0x1F15),
	            (0x1F18, 0x1F1D),
	            (0x1F20, 0x1F45),
	            (0x1F48, 0x1F4D),
	            (0x1F50, 0x1F57),
	            (0x1F59,),
	            (0x1F5B,),
	            (0x1F5D,),
	            (0x1F5F, 0x1F7D),
	            (0x1F80, 0x1FB4),
	            (0x1FB6, 0x1FC4),
	            (0x1FC6, 0x1FD3),
	            (0x1FD6, 0x1FDB),
	            (0x1FDD, 0x1FEF),
	            (0x1FF2, 0x1FF4),
	            (0x1FF6, 0x1FFE),
	            (0x2129,),
	            (0x2719, 0x271A),
	            (0xAB65,),
	            (0x10140, 0x1018D),
	            (0x101A0,),
	            (0x1D200, 0x1D245),
	            (0x1F7A1, 0x1F7A7),
	        ]

	    class Cyrillic(unicode_set):
	        """Unicode set for Cyrillic Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0400, 0x052F),
	            (0x1C80, 0x1C88),
	            (0x1D2B,),
	            (0x1D78,),
	            (0x2DE0, 0x2DFF),
	            (0xA640, 0xA672),
	            (0xA674, 0xA69F),
	            (0xFE2E, 0xFE2F),
	        ]

	    class Chinese(unicode_set):
	        """Unicode set for Chinese Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x2E80, 0x2E99),
	            (0x2E9B, 0x2EF3),
	            (0x31C0, 0x31E3),
	            (0x3400, 0x4DB5),
	            (0x4E00, 0x9FEF),
	            (0xA700, 0xA707),
	            (0xF900, 0xFA6D),
	            (0xFA70, 0xFAD9),
	            (0x16FE2, 0x16FE3),
	            (0x1F210, 0x1F212),
	            (0x1F214, 0x1F23B),
	            (0x1F240, 0x1F248),
	            (0x20000, 0x2A6D6),
	            (0x2A700, 0x2B734),
	            (0x2B740, 0x2B81D),
	            (0x2B820, 0x2CEA1),
	            (0x2CEB0, 0x2EBE0),
	            (0x2F800, 0x2FA1D),
	        ]

	    class Japanese(unicode_set):
	        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

	        class Kanji(unicode_set):
	            """Unicode set for Kanji Unicode Character Range"""
	            _ranges: UnicodeRangeList = [
	                (0x4E00, 0x9FBF),
	                (0x3000, 0x303F),
	            ]

	        class Hiragana(unicode_set):
	            """Unicode set for Hiragana Unicode Character Range"""
	            _ranges: UnicodeRangeList = [
	                (0x3041, 0x3096),
	                (0x3099, 0x30A0),
	                (0x30FC,),
	                (0xFF70,),
	                (0x1B001,),
	                (0x1B150, 0x1B152),
	                (0x1F200,),
	            ]

	        class Katakana(unicode_set):
	            """Unicode set for Katakana Unicode Character Range"""
	            _ranges: UnicodeRangeList = [
	                (0x3099, 0x309C),
	                (0x30A0, 0x30FF),
	                (0x31F0, 0x31FF),
	                (0x32D0, 0x32FE),
	                (0xFF65, 0xFF9F),
	                (0x1B000,),
	                (0x1B164, 0x1B167),
	                (0x1F201, 0x1F202),
	                (0x1F213,),
	            ]

	        # native-script aliases for the three nested sets
	        漢字 = Kanji
	        カタカナ = Katakana
	        ひらがな = Hiragana

	        # Japanese itself is the concatenation of its three component range lists
	        _ranges: UnicodeRangeList = (
	            Kanji._ranges
	            + Hiragana._ranges
	            + Katakana._ranges
	        )

	    class Hangul(unicode_set):
	        """Unicode set for Hangul (Korean) Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x1100, 0x11FF),
	            (0x302E, 0x302F),
	            (0x3131, 0x318E),
	            (0x3200, 0x321C),
	            (0x3260, 0x327B),
	            (0x327E,),
	            (0xA960, 0xA97C),
	            (0xAC00, 0xD7A3),
	            (0xD7B0, 0xD7C6),
	            (0xD7CB, 0xD7FB),
	            (0xFFA0, 0xFFBE),
	            (0xFFC2, 0xFFC7),
	            (0xFFCA, 0xFFCF),
	            (0xFFD2, 0xFFD7),
	            (0xFFDA, 0xFFDC),
	        ]

	    Korean = Hangul

	    # no ranges of its own: its characters come from the ranges of its bases
	    class CJK(Chinese, Japanese, Hangul):
	        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

	    class Thai(unicode_set):
	        """Unicode set for Thai Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0E01, 0x0E3A),
	            (0x0E3F, 0x0E5B),
	        ]

	    class Arabic(unicode_set):
	        """Unicode set for Arabic Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0600, 0x061B),
	            (0x061E, 0x06FF),
	            (0x0700, 0x077F),
	        ]

	    class Hebrew(unicode_set):
	        """Unicode set for Hebrew Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0591, 0x05C7),
	            (0x05D0, 0x05EA),
	            (0x05EF, 0x05F4),
	            (0xFB1D, 0xFB36),
	            (0xFB38, 0xFB3C),
	            (0xFB3E,),
	            (0xFB40, 0xFB41),
	            (0xFB43, 0xFB44),
	            (0xFB46, 0xFB4F),
	        ]

	    class Devanagari(unicode_set):
	        """Unicode set for Devanagari Unicode Character Range"""
	        _ranges: UnicodeRangeList = [
	            (0x0900, 0x097F),
	            (0xA8E0, 0xA8FF),
	        ]

	    BMP = BasicMultilingualPlane

	    # add language identifiers using language Unicode
	    العربية = Arabic
	    中文 = Chinese
	    кириллица = Cyrillic
	    Ελληνικά = Greek
	    עִברִית = Hebrew
	    日本語 = Japanese
	    한국어 = Korean
	    ไทย = Thai
	    देवनागरी = Devanagari

	    # fmt: on