File size: 714 Bytes
1ef9436
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

# Sentence-terminating symbols. make_text_chunk() ends a chunk right after the
# first character it finds in this set.
SYMBOL_SPLITS = {
    # CJK / full-width terminators. The full-width forms are written as
    # escapes so they cannot be silently collapsed into their ASCII
    # look-alikes by an editor or a text-normalization step.
    "。",
    "\uff1f",  # full-width question mark
    "\uff01",  # full-width exclamation mark
    # Two-character entry: a per-character membership test can never match
    # it (the single "…" below is what matches each half). Kept so that any
    # code testing whole tokens against this set keeps working.
    "……",
    # ASCII / half-width terminators.
    ".",
    "?",
    "!",
    "~",
    "…",
}


def make_text_chunk(original_text, strat_index, max_len=5, max_try=5000):
    """Cut one sentence-sized chunk out of ``original_text``.

    Scans forward from ``strat_index`` until the first character that is a
    member of ``SYMBOL_SPLITS`` and returns the text up to and including
    that character.

    Args:
        original_text: The text to cut from.
        strat_index: Index at which the chunk starts (expected to be
            non-negative).
        max_len: Accepted for interface compatibility; not used by this
            function.
        max_try: Maximum number of characters, counted from
            ``strat_index``, that may be scanned without finding a
            terminator while text still remains.

    Returns:
        A tuple ``(end_index, chunk)``. ``end_index`` is the index just past
        the terminator and ``chunk`` is
        ``original_text[strat_index:end_index]``. If no terminator is found
        before the end of the text, or ``strat_index`` is already at or past
        the end (including empty text), returns ``(0, "")``.

    Raises:
        ValueError: If more than ``max_try`` characters were scanned from
            ``strat_index`` without reaching a terminator or the end of the
            text.
    """
    text_len = len(original_text)

    # Nothing left to scan: report "not found" instead of letting the first
    # character lookup below raise IndexError.
    if strat_index >= text_len:
        return 0, ""

    end_index = strat_index
    while True:
        if original_text[end_index] in SYMBOL_SPLITS:
            # Step past the terminator so that it is part of the chunk.
            end_index += 1
            return end_index, original_text[strat_index:end_index]

        end_index += 1

        if end_index >= text_len:
            # Text too short: ran off the end without finding a terminator.
            return 0, ""

        # Measure the budget from the chunk start, not from the beginning of
        # the text; otherwise every chunk starting beyond ``max_try`` would
        # be rejected no matter how short it is.
        if end_index - strat_index > max_try:
            # Something is wrong: the sentence is implausibly long.
            raise ValueError("Reach max try")