from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def chunk_documents(dataset, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split every document text in *dataset* into overlapping chunks.

    Args:
        dataset: Iterable of entries. Each entry is indexed with
            ``'documents'`` (iterated, and each item is passed to the
            splitter's ``split_text`` -- presumably a list of strings) and
            with ``'question'`` (interpolated into the chunk's source label).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A flat list of dicts, one per chunk, in dataset order. Each dict has
        the keys ``'text'`` (the chunk) and ``'source'`` (the label
        ``"<question>_chunk_<i>"``).

    Raises:
        KeyError: If an entry lacks ``'documents'``, or lacks ``'question'``
            while at least one chunk is produced for it.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = []
    for data in dataset:
        # Flatten the chunks of all texts belonging to this entry so the
        # index below keeps counting across texts. Restarting at 0 for each
        # text would give several chunks of one entry the same 'source'.
        entry_chunks = (
            chunk
            for text in data['documents']
            for chunk in text_splitter.split_text(text)
        )
        for i, chunk in enumerate(entry_chunks):
            documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
    return documents