realtime-rag-pipeline / retriever / chunk_documents.py
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    """Split every document in the dataset into overlapping text chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    for data in dataset:
        # Each record is expected to carry a list of source texts under 'documents'.
        text_list = data['documents']
        for text in text_list:
            chunks = text_splitter.split_text(text)
            for i, chunk in enumerate(chunks):
                # Tag each chunk with the originating question and its chunk index
                # so it can be traced back after retrieval.
                documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
    return documents
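
A minimal usage sketch, assuming each dataset record is a dict with a 'question' string and a 'documents' list of strings (the field names the function above already expects); the sample record and chunk sizes here are illustrative, not part of the pipeline.

# Hypothetical usage example for chunk_documents; the sample record is made up.
if __name__ == "__main__":
    long_text = (
        "Retrieval-augmented generation (RAG) pairs a retriever with a "
        "generator so answers stay grounded in fetched context. "
    ) * 20
    sample_dataset = [
        {'question': 'What is RAG?', 'documents': [long_text]},
    ]
    chunks = chunk_documents(sample_dataset, chunk_size=200, chunk_overlap=50)
    # Print the traceable source tag and length of the first few chunks.
    for c in chunks[:3]:
        print(c['source'], len(c['text']))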