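"""Document processing utilities: load PDF, TXT, and CSV files, split them
into chunks, and ingest the chunks into the vector database."""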
import os
import sys
from typing import Any, Dict, List, Optional
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    CSVLoader
)
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import CHUNK_SIZE, CHUNK_OVERLAP
from app.core.memory import MemoryManager

class DocumentProcessor:
    """Processes documents for ingestion into the vector database."""
    
    def __init__(self, memory_manager: MemoryManager):
        self.memory_manager = memory_manager
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
    
    def process_file(self, file_path: str) -> List[Document]:
        """Process a file and return a list of Document chunks."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        
        # Get the file extension
        _, extension = os.path.splitext(file_path)
        extension = extension.lower()
        
        # Load the file using the appropriate loader
        if extension == '.pdf':
            loader = PyPDFLoader(file_path)
        elif extension == '.txt':
            loader = TextLoader(file_path)
        elif extension == '.csv':
            loader = CSVLoader(file_path)
        else:
            raise ValueError(f"Unsupported file type: {extension}")
        
        # Load and split the documents
        documents = loader.load()
        chunks = self.text_splitter.split_documents(documents)
        
        return chunks
    
    def ingest_file(self, file_path: str, metadata: Optional[Dict[str, Any]] = None) -> List[str]:
        """Ingest a file into the vector database and return the stored chunk IDs."""
        # Process the file
        chunks = self.process_file(file_path)
        
        # Default to empty caller-supplied metadata
        if metadata is None:
            metadata = {}
        
        # Base metadata records file provenance; caller metadata can override it
        base_metadata = {
            "source": file_path,
            "file_name": os.path.basename(file_path)
        }
        base_metadata.update(metadata)
        
        # Prepare chunks and metadatas
        texts = [chunk.page_content for chunk in chunks]
        metadatas = []
        
        for i, chunk in enumerate(chunks):
            chunk_metadata = base_metadata.copy()
            if hasattr(chunk, 'metadata'):
                chunk_metadata.update(chunk.metadata)
            chunk_metadata["chunk_id"] = i
            metadatas.append(chunk_metadata)
        
        # Store in vector database
        ids = self.memory_manager.add_texts(texts, metadatas)
        
        return ids
    
    def ingest_text(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> List[str]:
        """Ingest raw text into the vector database and return the stored chunk IDs."""
        if metadata is None:
            metadata = {}
        
        # Split the text
        chunks = self.text_splitter.split_text(text)
        
        # Prepare metadatas
        metadatas = []
        for i in range(len(chunks)):
            chunk_metadata = metadata.copy()
            chunk_metadata["chunk_id"] = i
            chunk_metadata["source"] = "direct_input"
            metadatas.append(chunk_metadata)
        
        # Store in vector database
        ids = self.memory_manager.add_texts(chunks, metadatas)
        
        return ids
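

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: it assumes
    # MemoryManager can be constructed with no arguments and that an
    # "example.pdf" exists alongside this script; adjust both for your setup.
    manager = MemoryManager()
    processor = DocumentProcessor(manager)
    chunk_ids = processor.ingest_file("example.pdf", metadata={"category": "demo"})
    print(f"Stored {len(chunk_ids)} chunks")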