File size: 1,081 Bytes
826253d 026aeba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib
def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    """Split the texts in *dataset* into chunks, dropping exact duplicates.

    Every entry of *dataset* is indexed with ``'documents'`` (iterated as a
    collection of texts) and ``'question'`` (used to label the chunks).
    Each text is split with a ``RecursiveCharacterTextSplitter`` configured
    with *chunk_size* and *chunk_overlap*.

    A chunk is considered a duplicate when the SHA-256 digest of its encoded
    text has already been seen anywhere earlier in the dataset; only the
    first occurrence is kept.

    Returns:
        A list of ``{'text': chunk, 'source': label}`` dicts in the order the
        chunks were first encountered, where ``label`` is
        ``"<question>_chunk_<index>"``.

    Note:
        ``<index>`` is the chunk's position within its own text (counted
        before duplicates are skipped), and it restarts for every text, so
        labels are not unique when one entry holds several documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    fingerprints = set()  # digests of every chunk emitted so far
    result = []

    for entry in dataset:
        for raw_text in entry['documents']:
            pieces = splitter.split_text(raw_text)
            for position, piece in enumerate(pieces):
                digest = hashlib.sha256(piece.encode()).hexdigest()
                if digest not in fingerprints:
                    # First time this exact chunk text is seen: keep it.
                    record = {
                        'text': piece,
                        'source': f"{entry['question']}_chunk_{position}",
                    }
                    result.append(record)
                    fingerprints.add(digest)

    return result