# NOTE(review): removed stray extraction artifacts ("Spaces:", "Runtime error" x2)
# that preceded the module and made the file invalid Python.
import logging
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_core.documents import Document

from .config import Config

logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Load text files and split them into overlapping chunks for indexing.

    Chunking parameters default to the values in
    ``Config.get_doc_processing_config()`` but can be overridden per instance.
    """

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        """Initialize the splitter.

        Args:
            chunk_size: Max characters per chunk; falls back to config when falsy.
            chunk_overlap: Overlap between consecutive chunks; falls back to config when falsy.
        """
        config = Config.get_doc_processing_config()
        self.chunk_size = chunk_size or config['chunk_size']
        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Prefer paragraph, then line, then word boundaries before a hard cut.
            separators=["\n\n", "\n", " ", ""],
        )

    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
        """Load a text file into a list of ``Document`` objects.

        Args:
            file_path: Path to the text file.
            encoding: Text encoding; falls back to the configured default when falsy.

        Returns:
            The documents produced by ``TextLoader``.

        Raises:
            Exception: Re-raised from the loader after logging.
        """
        try:
            config = Config.get_doc_processing_config()
            encoding = encoding or config['encoding']
            logger.info("Loading document from %s", file_path)
            loader = TextLoader(file_path, encoding=encoding)
            documents = loader.load()
            logger.info("Successfully loaded %d document(s)", len(documents))
            return documents
        except Exception:
            # logger.exception records the stack trace; bare raise keeps the
            # original traceback (``raise e`` would reset its origin here).
            logger.exception("Error loading document from %s", file_path)
            raise

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks using the configured splitter.

        Args:
            documents: Documents to split.

        Returns:
            The resulting chunks.

        Raises:
            Exception: Re-raised from the splitter after logging.
        """
        try:
            logger.info("Chunking %d document(s)", len(documents))
            chunks = self.text_splitter.split_documents(documents)
            logger.info("Successfully created %d chunk(s)", len(chunks))
            return chunks
        except Exception:
            logger.exception("Error chunking documents")
            raise

    def process_document(self, file_path: str) -> List[Document]:
        """Load a file and split it into chunks (load + chunk convenience wrapper).

        Args:
            file_path: Path to the text file.

        Returns:
            The chunked documents.

        Raises:
            Exception: Re-raised from loading/chunking after logging.
        """
        try:
            documents = self.load_document(file_path)
            chunks = self.chunk_documents(documents)
            logger.info("Document processing completed: %d chunks created", len(chunks))
            return chunks
        except Exception:
            logger.exception("Error processing document")
            raise

    def get_document_stats(self, chunks: List[Document]) -> dict:
        """Compute size statistics over a list of chunks.

        Args:
            chunks: Chunked documents (only ``page_content`` is read).

        Returns:
            dict with ``total_chunks``, ``total_characters``, ``avg_chunk_size``
            (float), ``min_chunk_size``, ``max_chunk_size``; all zeros when
            ``chunks`` is empty.
        """
        if not chunks:
            return {
                'total_chunks': 0,
                'total_characters': 0,
                'avg_chunk_size': 0,
                'min_chunk_size': 0,
                'max_chunk_size': 0
            }
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
        total_chars = sum(chunk_sizes)
        return {
            'total_chunks': len(chunks),
            'total_characters': total_chars,
            'avg_chunk_size': total_chars / len(chunks),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes)
        }