Create retriever/chunk_documents.py
retriever/chunk_documents.py
ADDED (+49, -0)
import hashlib
import logging

from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """
    Chunk a list of page contents into smaller segments with document ID metadata.

    Args:
        page_list (list): List of strings, each string being the content of a page.
        doc_id (str): Unique identifier for the document.
        chunk_size (int): Maximum size of each chunk (default: 1000 characters).
        chunk_overlap (int): Overlap between chunks (default: 200 characters).

    Returns:
        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
        if not page_content or not isinstance(page_content, str):
            continue  # Skip empty or invalid pages

        # Split the page content into chunks
        chunks = text_splitter.split_text(page_content)

        for i, chunk in enumerate(chunks):
            # Generate a unique hash for the chunk
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            # Skip if the chunk is a duplicate
            if chunk_hash in seen_hashes:
                continue

            # Create source identifier (e.g., "doc_123_page_1_chunk_0")
            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"

            # Add the chunk with doc_id as metadata
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)

    logging.info(f"Chunking complete: produced {len(documents)} chunks for doc_id {doc_id}")
    return documents