Spaces:
Sleeping
Sleeping
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import hashlib | |
def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    """Split every text in *dataset* into chunks, dropping repeated chunks.

    Args:
        dataset: Iterable of entries. Each entry is indexed with
            ``'documents'`` (iterated as a collection of texts) and
            ``'question'`` (interpolated into each chunk's source label).
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A list of ``{'text': ..., 'source': ...}`` dicts, one per distinct
        chunk, in the order the chunks were first encountered. ``'source'``
        is ``"<question>_chunk_<i>"`` where ``i`` is the chunk's position
        within the text it was split from.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    collected = []
    # SHA-256 digests of every chunk already emitted, shared across the
    # whole dataset so a chunk repeated in any entry is kept only once.
    known_digests = set()

    for entry in dataset:
        passages = entry['documents']
        for passage in passages:
            pieces = splitter.split_text(passage)
            # The index counts every piece, including skipped duplicates,
            # so labels reflect the position within the split text.
            for position, piece in enumerate(pieces):
                digest = hashlib.sha256(piece.encode()).hexdigest()
                if digest not in known_digests:
                    collected.append({'text': piece, 'source': f"{entry['question']}_chunk_{position}"})
                    known_digests.add(digest)

    return collected