|
|
|
from zhon.hanzi import punctuation as zh_punc |
|
|
|
def is_zh_char(uchar):
    """Report whether ``uchar`` falls inside U+4E00..U+9FA5 (both inclusive).

    The test is an ordinary string comparison against the two boundary
    characters, so it is intended to receive a single character.

    Reference: https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    (the pattern cited there spans U+4E00..U+9FD5, whereas the upper bound
    used here is U+9FA5).
    """
    range_start = u'\u4e00'
    range_end = u'\u9fa5'
    return range_start <= uchar <= range_end
|
|
|
|
|
def has_zh(text):
    """Return True as soon as one character of ``text`` passes ``is_zh_char``.

    An empty ``text`` yields False.
    """
    for ch in text:
        if is_zh_char(ch):
            return True
    return False
|
|
|
|
|
def get_zh_count(text):
    """Count how many characters of ``text`` pass ``is_zh_char``."""
    total = 0
    for ch in text:
        if is_zh_char(ch):
            total += 1
    return total
|
|
|
|
|
def is_all_zh(text):
    """Return True when every character of ``text`` passes ``is_zh_char``.

    An empty ``text`` yields True, because no character fails the check.
    """
    for ch in text:
        if not is_zh_char(ch):
            return False
    return True
|
|
|
|
|
def is_all_en(text):
    """Return True when ``text`` is non-empty and made only of ASCII letters.

    The string is encoded to UTF-8 first and the check runs on the bytes:
    ``bytes.isalpha`` only accepts ASCII ``a-z`` / ``A-Z``, so any non-ASCII
    character (encoded as bytes >= 0x80) makes the result False.
    """
    utf8_bytes = text.encode('utf-8')
    return utf8_bytes.isalpha()
|
|
|
|
|
def is_digit_char(uchar):
    """Return True when ``uchar`` is exactly one ASCII digit (``0``-``9``).

    Args:
        uchar: the string to test; expected to be a single character.

    Returns:
        bool: True only for the ten one-character strings "0" .. "9".
    """
    # `x in "0123456789"` alone is a substring test, which would also accept
    # the empty string and consecutive runs such as "12"; the length guard
    # restricts the check to a single character.
    return len(uchar) == 1 and uchar in "0123456789"
|
|
|
|
|
def has_digit(text):
    """Return True as soon as one character of ``text`` passes ``is_digit_char``.

    An empty ``text`` yields False.
    """
    for ch in text:
        if is_digit_char(ch):
            return True
    return False
|
|
|
|
|
def is_all_digit(text):
    """Return True when every character of ``text`` passes ``is_digit_char``.

    An empty ``text`` yields True, because no character fails the check.
    """
    for ch in text:
        if not is_digit_char(ch):
            return False
    return True
|
|
|
|
|
def get_digit_count(text):
    """Count the items of ``text`` that are found in ``"0123456789"``."""
    return sum(1 for ch in text if ch in "0123456789")
|
|
|
|
|
|
|
def has_zh_punc(text):
    """Whether ``text`` contains Chinese punctuation.

    Each character is looked up in ``zh_punc`` (imported from
    ``zhon.hanzi.punctuation``); the first hit returns True.
    An empty ``text`` yields False.
    """
    for ch in text:
        if ch in zh_punc:
            return True
    return False
|
|
|
|
|
|
|
def is_space_char(uchar):
    """Return True when ``uchar`` is a single whitespace character.

    The function was previously an empty stub (docstring only), so it always
    returned None. It now applies the same criterion ``get_space_count`` uses
    in this module: a character counts as space when ``str.strip()`` removes
    it entirely.

    Args:
        uchar: the string to test; expected to be a single character.

    Returns:
        bool: True only for a one-character string that strips to empty.

    NOTE(review): the stub's docstring contained only the link
    https://emptycharacter.com/ -- presumably a hint that blank/invisible
    characters beyond regular whitespace were meant to be covered as well.
    Those are NOT handled here; confirm the intended character set.
    """
    # The length guard keeps "" (which also strips to empty) from being
    # reported as a space character.
    return len(uchar) == 1 and not uchar.strip()
|
|
|
|
|
def has_space(text):
    """Return True when at least one character of ``text`` is whitespace.

    The function was previously a ``pass`` stub that always returned None.
    A character counts as whitespace when ``str.strip()`` removes it
    entirely -- the same criterion ``get_space_count`` uses in this module.

    Args:
        text: the string to scan.

    Returns:
        bool: True if any character strips to empty; False otherwise
        (including for an empty ``text``).
    """
    return any(not ch.strip() for ch in text)
|
|
|
def is_all_space(text):
    """Return True when every character of ``text`` is whitespace.

    The function was previously a ``pass`` stub that always returned None.
    A character counts as whitespace when ``str.strip()`` removes it
    entirely -- the same criterion ``get_space_count`` uses in this module.

    Args:
        text: the string to scan.

    Returns:
        bool: True if no character survives stripping. An empty ``text``
        yields True, mirroring ``is_all_zh`` / ``is_all_digit``, which are
        also built on ``all()``.
    """
    return all(not ch.strip() for ch in text)
|
|
|
def get_space_count(text):
    """Count the characters of ``text`` that ``strip()`` reduces to nothing."""
    return sum(1 for ch in text if len(ch.strip()) == 0)
|
|