File size: 562 Bytes
b58a992
026aeba
 
b58a992
026aeba
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    """Split every document of every dataset record into text chunks.

    Args:
        dataset: Iterable of records. Each record is indexed with
            ``'documents'`` (an iterable of texts) and ``'question'``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of dicts, one per chunk, with keys ``'text'`` (the chunk) and
        ``'source'`` (``"<question>_chunk_<n>"``). ``n`` counts chunks across
        all documents of the same record, so labels are unique within a
        record even when it holds several documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    documents = []
    for data in dataset:
        # Flatten the chunks of all documents in this record into one stream
        # so the index below keeps counting across documents instead of
        # restarting at 0 for each one (which produced duplicate labels).
        record_chunks = (
            chunk
            for text in data['documents']
            for chunk in text_splitter.split_text(text)
        )
        for i, chunk in enumerate(record_chunks):
            documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
    return documents