VocRT / providers /chunk_provider.py
Anurag
version-2 initial version
5306da4
import re
def _split_oversized(sentence, max_chars):
    """Break one sentence longer than *max_chars* into pieces that each fit."""
    pieces = []
    current = ''
    for word in sentence.split():
        # A single word longer than the limit has no whitespace to break on,
        # so it is hard-sliced into max_chars-sized runs.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ''
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chars:
            current = f'{current} {word}'
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_chars=2040):
    """Split *text* into chunks of at most *max_chars* characters.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Consecutive sentences are packed greedily into a chunk, joined
    by a single space, for as long as the joined chunk stays within
    *max_chars*. A sentence that is longer than *max_chars* on its own is
    broken on word boundaries (and, for a single over-long word, by hard
    slicing) so that every returned chunk respects the limit.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        max_chars: Maximum length of each returned chunk, counting the
            spaces used to join sentences. Must be at least 1.

    Returns:
        A list of non-empty chunk strings, in original order.

    Raises:
        ValueError: If *max_chars* is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if not text:
        return []

    # Strip first so leading/trailing whitespace cannot produce empty or
    # whitespace-padded chunks.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    # Normalise into units that are each guaranteed to fit in one chunk.
    units = []
    for sentence in sentences:
        if not sentence:
            continue
        if len(sentence) > max_chars:
            units.extend(_split_oversized(sentence, max_chars))
        else:
            units.append(sentence)

    chunks = []
    current_chunk = []
    current_length = 0
    for unit in units:
        # Every unit after the first in a chunk costs one extra character
        # for the space inserted by ' '.join().
        added_length = len(unit) + (1 if current_chunk else 0)
        if current_length + added_length <= max_chars:
            current_chunk.append(unit)
            current_length += added_length
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [unit]
            current_length = len(unit)

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks