| from zhon.hanzi import punctuation as zh_punc | |
def is_zh_char(uchar):
    """Return True if uchar lies in the range U+4E00..U+9FA5 (inclusive).

    The comparison is a plain string comparison against the two bounds, so
    an empty string yields False.

    For reference, jieba matches Chinese text with the wider character
    class U+4E00..U+9FD5:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    """
    lower_bound, upper_bound = "\u4e00", "\u9fa5"
    if uchar < lower_bound:
        return False
    return uchar <= upper_bound
def has_zh(text):
    """Return True if at least one character of text is a Chinese character.

    Each character is checked with is_zh_char; scanning stops at the first
    match. An empty text yields False.
    """
    for candidate in text:
        if is_zh_char(candidate):
            return True
    return False
def get_zh_count(text):
    """Return how many characters of text are Chinese per is_zh_char."""
    total = 0
    for candidate in text:
        # is_zh_char returns a bool, which adds as 0 or 1.
        total += is_zh_char(candidate)
    return total
def is_all_zh(text):
    """Return True if every character of text is Chinese per is_zh_char.

    An empty text yields True, because there is no character that fails
    the check.
    """
    for candidate in text:
        if not is_zh_char(candidate):
            return False
    return True
def is_all_en(text):
    """Return True if text is non-empty and consists only of ASCII letters.

    The text is encoded to UTF-8 before testing: bytes.isalpha() accepts
    only the ASCII letters a-z / A-Z and is False for an empty sequence, so
    any non-ASCII character (encoded as bytes >= 0x80), digit, space or
    punctuation mark makes the result False.
    """
    utf8_bytes = text.encode("utf-8")
    return utf8_bytes.isalpha()
def is_digit_char(uchar):
    """Return True if uchar is exactly one ASCII digit character, 0-9.

    The length check matters: a bare substring test against "0123456789"
    would also accept the empty string and multi-character runs such as
    "345", neither of which is a single digit character.
    """
    return len(uchar) == 1 and "0" <= uchar <= "9"
def has_digit(text):
    """Return True if at least one character of text is a digit.

    Each character is checked with is_digit_char; scanning stops at the
    first match. An empty text yields False.
    """
    for candidate in text:
        if is_digit_char(candidate):
            return True
    return False
def is_all_digit(text):
    """Return True if every character of text is a digit per is_digit_char.

    An empty text yields True, because there is no character that fails
    the check.
    """
    for candidate in text:
        if not is_digit_char(candidate):
            return False
    return True
def get_digit_count(text):
    """Return the number of characters in text that are ASCII digits 0-9."""
    ascii_digits = "0123456789"
    return sum(1 for symbol in text if symbol in ascii_digits)
def has_zh_punc(text):
    """Return True if text contains at least one Chinese punctuation mark.

    Membership is tested against zh_punc (zhon.hanzi.punctuation); scanning
    stops at the first match. An empty text yields False.
    """
    for candidate in text:
        if candidate in zh_punc:
            return True
    return False
def is_space_char(uchar):
    """Return True if uchar is exactly one Unicode whitespace character.

    Whitespace is decided by str.isspace(), which for a single character
    agrees with the "strips to nothing" test used by get_space_count
    (space, tab, newline, ideographic space U+3000, ...). The empty string
    and multi-character strings yield False.

    NOTE(review): the original stub only linked to
    https://emptycharacter.com/ and had no body. Blank-looking characters
    that are not Unicode whitespace (e.g. zero-width space U+200B) are NOT
    treated as spaces here - confirm whether they should be.
    """
    return len(uchar) == 1 and uchar.isspace()
def has_space(text):
    """Return True if text contains at least one Unicode whitespace character.

    The original body was an unimplemented placeholder that always returned
    None. Whitespace is decided per character with str.isspace(), the same
    notion get_space_count uses. An empty text yields False.
    """
    return any(symbol.isspace() for symbol in text)
def is_all_space(text):
    """Return True if every character of text is Unicode whitespace.

    The original body was an unimplemented placeholder that always returned
    None. Whitespace is decided per character with str.isspace(), the same
    notion get_space_count uses. Like is_all_zh and is_all_digit, an empty
    text yields True because no character fails the check.
    """
    return all(symbol.isspace() for symbol in text)
def get_space_count(text):
    """Return the number of characters in text that are whitespace.

    A character counts when stripping it leaves nothing behind, i.e. it is
    one of the characters str.strip() removes by default.
    """
    return sum(1 for symbol in text if not symbol.strip())