File size: 714 Bytes
1ef9436 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# Characters that terminate a text chunk. make_text_chunk() tests membership
# one character at a time, so only single-character entries can ever match.
SYMBOL_SPLITS = {
    # CJK / fullwidth sentence terminators. The fullwidth question and
    # exclamation marks are written as escapes so they cannot be silently
    # folded into their ASCII look-alikes by width normalization.
    "。",
    "\uff1f",  # fullwidth question mark
    "\uff01",  # fullwidth exclamation mark
    # Two-character ellipsis: never equal to a single character, kept for
    # backward compatibility; the single "…" below is what actually matches.
    "……",
    # ASCII / halfwidth terminators.
    ".",
    "?",
    "!",
    "~",
    "…",
}
def make_text_chunk(original_text, strat_index, max_len=5, max_try=5000):
    """Cut one chunk out of ``original_text``, ending at the next split symbol.

    Scans forward from ``strat_index`` until the first character that is a
    member of ``SYMBOL_SPLITS`` and returns the text up to and including it.

    Args:
        original_text: The text to scan.
        strat_index: Index at which the chunk starts (spelling kept so that
            existing keyword callers keep working).
        max_len: Unused; retained for backward compatibility.
        max_try: Maximum number of characters that may be scanned past
            ``strat_index`` without finding a split symbol.

    Returns:
        A ``(end_index, chunk)`` tuple, where ``end_index`` is the index just
        past the split symbol and ``chunk`` is
        ``original_text[strat_index:end_index]``. Returns ``(0, "")`` when the
        end of the text is reached without finding a split symbol, or when
        ``strat_index`` is already at or past the end of the text.

    Raises:
        ValueError: If more than ``max_try`` characters were scanned without
            finding a split symbol.
    """
    text_len = len(original_text)
    if strat_index >= text_len:
        # Nothing left to scan; report "not found" instead of raising
        # IndexError on the first character lookup.
        return 0, ""

    end_index = strat_index
    while True:
        is_split = original_text[end_index] in SYMBOL_SPLITS
        end_index += 1
        if is_split:
            return end_index, original_text[strat_index:end_index]
        if end_index >= text_len:
            # Text too short: ran off the end without finding a split symbol.
            return 0, ""
        # Bound the scan by the distance travelled from the start, not by the
        # absolute index, so chunks that begin deep inside a long text are
        # not rejected immediately.
        if end_index - strat_index > max_try:
            raise ValueError("Reach max try")
|