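"""Document loading and chunking utilities built on LangChain.

Provides DocumentProcessor, a thin wrapper around TextLoader and
RecursiveCharacterTextSplitter configured from the project's Config.
"""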
import logging
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_core.documents import Document

from .config import Config

logger = logging.getLogger(__name__)

class DocumentProcessor:
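    """Loads text files and splits them into overlapping chunks for downstream retrieval."""
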
    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        """Initialize the splitter, falling back to Config values for any unset sizes."""
        config = Config.get_doc_processing_config()
        self.chunk_size = chunk_size or config['chunk_size']
        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
        # Prefer paragraph and line boundaries, then spaces, before falling
        # back to hard character cuts.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
        """Load a plain-text file into a list of LangChain Documents."""
        try:
            config = Config.get_doc_processing_config()
            encoding = encoding or config['encoding']
            logger.info(f"Loading document from {file_path}")
            loader = TextLoader(file_path, encoding=encoding)
            documents = loader.load()
            logger.info(f"Successfully loaded {len(documents)} document(s)")
            return documents
        except Exception as e:
            logger.error(f"Error loading document from {file_path}: {e}")
            raise  # re-raise with the original traceback intact

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split loaded documents into overlapping chunks."""
        try:
            logger.info(f"Chunking {len(documents)} document(s)")
            chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Successfully created {len(chunks)} chunk(s)")
            return chunks
        except Exception as e:
            logger.error(f"Error chunking documents: {e}")
            raise

    def process_document(self, file_path: str) -> List[Document]:
        """Load a file and chunk it in one step."""
        try:
            documents = self.load_document(file_path)
            chunks = self.chunk_documents(documents)
            logger.info(f"Document processing completed: {len(chunks)} chunks created")
            return chunks
        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise

    def get_document_stats(self, chunks: List[Document]) -> dict:
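        """Return chunk-count and character-size statistics for a list of chunks."""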
        if not chunks:
            return {
                'total_chunks': 0,
                'total_characters': 0,
                'avg_chunk_size': 0,
                'min_chunk_size': 0,
                'max_chunk_size': 0
            }
        
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
        total_chars = sum(chunk_sizes)

        return {
            'total_chunks': len(chunks),
            'total_characters': total_chars,
            'avg_chunk_size': total_chars / len(chunks),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes)
        }
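

# Minimal usage sketch (assumptions: Config.get_doc_processing_config() supplies
# 'chunk_size', 'chunk_overlap', and 'encoding'; "notes.txt" is a hypothetical
# UTF-8 text file):
#
#     processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
#     chunks = processor.process_document("notes.txt")
#     print(processor.get_document_stats(chunks))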