"""Document loading and chunking utilities built on LangChain."""

import logging
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_core.documents import Document

from .config import Config

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Loads text documents and splits them into overlapping chunks."""

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        # Fall back to configured defaults when no explicit sizes are given.
        config = Config.get_doc_processing_config()
        self.chunk_size = chunk_size or config['chunk_size']
        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
        # Try paragraph breaks first, then line breaks, words, and finally characters.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
        """Load a plain-text file into a list of LangChain documents."""
        try:
            config = Config.get_doc_processing_config()
            encoding = encoding or config['encoding']
            logger.info(f"Loading document from {file_path}")
            loader = TextLoader(file_path, encoding=encoding)
            documents = loader.load()
            logger.info(f"Successfully loaded {len(documents)} document(s)")
            return documents
        except Exception as e:
            logger.error(f"Error loading document from {file_path}: {e}")
            raise  # Re-raise as-is to preserve the original traceback.

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split loaded documents into overlapping chunks."""
        try:
            logger.info(f"Chunking {len(documents)} document(s)")
            chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Successfully created {len(chunks)} chunk(s)")
            return chunks
        except Exception as e:
            logger.error(f"Error chunking documents: {e}")
            raise

    def process_document(self, file_path: str) -> List[Document]:
        """Load a file and chunk it in one step."""
        try:
            documents = self.load_document(file_path)
            chunks = self.chunk_documents(documents)
            logger.info(f"Document processing completed: {len(chunks)} chunks created")
            return chunks
        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise

    def get_document_stats(self, chunks: List[Document]) -> dict:
        """Return chunk count, character totals, and chunk-size extremes."""
        if not chunks:
            # Avoid division by zero and min()/max() on an empty sequence.
            return {
                'total_chunks': 0,
                'total_characters': 0,
                'avg_chunk_size': 0,
                'min_chunk_size': 0,
                'max_chunk_size': 0,
            }
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
        total_chars = sum(chunk_sizes)
        return {
            'total_chunks': len(chunks),
            'total_characters': total_chars,
            'avg_chunk_size': total_chars / len(chunks),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes),
        }
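

# Minimal usage sketch, assuming Config.get_doc_processing_config() supplies
# 'chunk_size', 'chunk_overlap', and 'encoding' defaults; "sample.txt" is a
# hypothetical placeholder path, not part of this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
    chunks = processor.process_document("sample.txt")  # hypothetical input file
    print(processor.get_document_stats(chunks))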