File size: 562 Bytes
b58a992 026aeba b58a992 026aeba |
1 2 3 4 5 6 7 8 9 10 11 12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    """Split every document text in *dataset* into chunks and flatten them.

    Args:
        dataset: Iterable of records. Each record is indexed with
            ``'documents'`` (an iterable of texts handed to the splitter)
            and ``'question'`` (used to label the resulting chunks).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A flat list of dicts, one per chunk, with keys ``'text'`` (the
        chunk) and ``'source'`` (``"<question>_chunk_<i>"``).

    Note:
        The chunk index ``i`` restarts at 0 for each text in a record's
        ``'documents'``, so a record with several documents can yield
        repeated ``'source'`` labels.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # One flat pass: records -> their document texts -> chunks of each text.
    return [
        {'text': piece, 'source': f"{record['question']}_chunk_{index}"}
        for record in dataset
        for passage in record['documents']
        for index, piece in enumerate(splitter.split_text(passage))
    ]