Upload folder using huggingface_hub
- .dockerignore +19 -0
- .env.example +5 -0
- .gitignore +64 -0
- Dockerfile +40 -0
- LICENSE +21 -0
- README.md +84 -10
- app.py +259 -0
- docker-compose.yaml +26 -0
- requirements.txt +11 -0
- src/config.py +97 -0
- src/document_processor.py +82 -0
- src/embedding_manager.py +66 -0
- src/rag_pipeline.py +161 -0
- src/ui_components.py +89 -0
- src/vector_store.py +144 -0
.dockerignore
ADDED
@@ -0,0 +1,19 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+.git/
+.gitignore
+README.md
+.vscode/
+*.egg-info/
+dist/
+build/
+.pytest_cache/
+.coverage
+.tox/
+.cache
+chroma_db/
+documents/
.env.example
ADDED
@@ -0,0 +1,5 @@
+GOOGLE_API_KEY=your_google_api_key_here
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+LLM_TEMPERATURE=0.3
.gitignore
ADDED
@@ -0,0 +1,64 @@
+# Environment variables (keep .env.example)
+.env
+!.env.example
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Testing
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Virtual environments
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Application specific
+chroma_db/
+documents/
+*.log
Dockerfile
ADDED
@@ -0,0 +1,40 @@
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements file first (for better caching)
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy source code
+COPY src/ ./src/
+COPY app.py .
+
+# Create directories for data persistence
+RUN mkdir -p /app/chroma_db /app/documents
+
+# Create a non-root user
+RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+USER appuser
+
+# Expose Streamlit port
+EXPOSE 8501
+
+# Run the application
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 SneakyGraySnake
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,10 +1,84 @@
+# RAG Document System
+
+A simple document interaction system using Retrieval-Augmented Generation (RAG) with Streamlit and Google's Gemini AI.
+
+## Features
+
+- Upload text documents (.txt files)
+- Ask questions about your documents
+- Get AI-powered answers with source citations
+- Persistent vector database storage
+- Clean web interface
+
+## Setup
+
+### With Docker (Recommended)
+
+1. Clone the repository:
+```bash
+git clone https://github.com/tusiim3/RAG-Document-System.git
+cd RAG-Document-System
+```
+
+2. Copy `.env.example` to `.env` and add your Google API key:
+```bash
+cp .env.example .env
+```
+
+3. Run with Docker Compose:
+```bash
+docker-compose up --build
+```
+
+4. Open http://localhost:8501 in your browser
+
+### Without Docker
+
+1. Clone the repository:
+```bash
+git clone https://github.com/tusiim3/RAG-Document-System.git
+cd RAG-Document-System
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Copy `.env.example` to `.env` and add your Google API key:
+```bash
+cp .env.example .env
+```
+
+4. Run the application:
+```bash
+streamlit run app.py
+```
+
+5. Open http://localhost:8501 in your browser
+
+## Environment Variables
+
+Set in the `.env` file (`GOOGLE_API_KEY` is required; the rest have defaults):
+- `GOOGLE_API_KEY` - Your Google API key for Gemini
+- `CHUNK_SIZE` - Text chunk size in characters (default: 1000)
+- `CHUNK_OVERLAP` - Overlap between consecutive chunks (default: 200)
+- `EMBEDDING_MODEL` - Embedding model name
+- `LLM_TEMPERATURE` - AI response temperature (default: 0.3)
+
+## Usage
+
+1. Upload a text document using the file uploader
+2. Wait for document processing to complete
+3. Ask questions about the document in the chat interface
+4. View source documents for each answer
+
+## Technology Stack
+
+- Streamlit for web interface
+- LangChain for document processing
+- ChromaDB for vector storage
+- Google Gemini for AI responses
+- Docker for containerization
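For readers who want to exercise the pipeline outside Streamlit, a minimal sketch (a hypothetical script, not part of this commit; it assumes a `.env` containing `GOOGLE_API_KEY` and a plain-text file at the illustrative path `docs/sample.txt`):

```python
# Hypothetical usage sketch -- not part of this commit.
from src.rag_pipeline import RAGPipeline

pipeline = RAGPipeline(chunk_size=1000, chunk_overlap=200)

# process_document() chunks, embeds, and indexes the file, then builds the QA chain
if pipeline.process_document("docs/sample.txt"):  # illustrative path
    answer, sources = pipeline.query("What is this document about?")
    print(answer)
    for doc in sources[:3]:
        print("-", doc.page_content[:100])
```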
app.py
ADDED
@@ -0,0 +1,259 @@
+import streamlit as st
+import os
+import tempfile
+import logging
+from dotenv import load_dotenv
+import uuid
+
+# UI components live in src/ui_components.py for easier debugging and maintenance
+
+from src.ui_components import (
+    setup_page_config, load_custom_css, render_header,
+    render_getting_started, render_system_info,
+    render_processing_spinner
+)
+from src.rag_pipeline import RAGPipeline
+
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def initialize_session_state():
+    if 'session_id' not in st.session_state:
+        st.session_state.session_id = str(uuid.uuid4())
+
+    if 'rag_pipeline' not in st.session_state:
+        st.session_state.rag_pipeline = None
+
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+
+    if 'rag_sources' not in st.session_state:
+        st.session_state.rag_sources = []
+
+    if 'document_loaded' not in st.session_state:
+        st.session_state.document_loaded = False
+
+    if 'document_stats' not in st.session_state:
+        st.session_state.document_stats = None
+
+def process_uploaded_document(uploaded_file):
+    try:
+        st.info(f"Starting to process: {uploaded_file.name}")
+
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt', mode='w', encoding='utf-8') as tmp_file:
+            content = uploaded_file.getvalue().decode('utf-8')
+            tmp_file.write(content)
+            tmp_file_path = tmp_file.name
+
+        st.info(f"File saved temporarily at: {tmp_file_path}")
+        st.info(f"File content length: {len(content)} characters")
+
+        # Initialize RAG pipeline if not already done
+        if st.session_state.rag_pipeline is None:
+            st.info("Initializing RAG pipeline...")
+            st.session_state.rag_pipeline = RAGPipeline()
+
+        # Process document
+        st.info("Processing document through RAG pipeline...")
+        success = st.session_state.rag_pipeline.process_document(tmp_file_path)
+
+        if success:
+            st.info("Document processed successfully, getting statistics...")
+            # Get document statistics
+            chunks = st.session_state.rag_pipeline.document_processor.process_document(tmp_file_path)
+            stats = st.session_state.rag_pipeline.document_processor.get_document_stats(chunks)
+
+            # Update session state
+            st.session_state.document_loaded = True
+            st.session_state.document_stats = stats
+
+            st.info(f"Document processed successfully: {stats['total_chunks']} chunks")
+        else:
+            st.error("Failed to process document")
+
+        # Clean up temporary file
+        os.unlink(tmp_file_path)
+
+        return success
+
+    except Exception as e:
+        st.error(f"Error processing uploaded document: {e}")
+        logger.error(f"Error processing uploaded document: {e}")
+        return False
+
+def handle_user_query(user_question):
+    try:
+        if not st.session_state.rag_pipeline or not st.session_state.document_loaded:
+            return "Please upload a document first before asking questions.", []
+
+        # Note: the caller appends the user message to the history before calling
+        # this function, so it is not appended again here (doing both duplicated it)
+
+        # Get response from RAG pipeline
+        with render_processing_spinner("Thinking..."):
+            answer, source_docs = st.session_state.rag_pipeline.query(user_question)
+
+        # Add assistant response to messages
+        st.session_state.messages.append({
+            "role": "assistant",
+            "content": answer,
+            "sources": source_docs
+        })
+
+        logger.info(f"Query processed: '{user_question[:50]}...'")
+        return answer, source_docs
+
+    except Exception as e:
+        logger.error(f"Error handling user query: {e}")
+        error_message = f"Error processing query: {str(e)}"
+        st.session_state.messages.append({"role": "assistant", "content": error_message, "sources": []})
+        return error_message, []
+
+def clear_all_documents():
+    # Clear the vector store first, while the pipeline reference still exists
+    # (resetting rag_pipeline to None beforehand would silently skip this step)
+    if st.session_state.rag_pipeline and st.session_state.rag_pipeline.vector_store_manager:
+        st.session_state.rag_pipeline.vector_store_manager.clear_vector_store()
+
+    st.session_state.rag_sources = []
+    st.session_state.document_loaded = False
+    st.session_state.document_stats = None
+    st.session_state.rag_pipeline = None
+    st.session_state.uploaded_files = []
+
+    # Increment uploader key to reset file uploader
+    if 'uploader_key' not in st.session_state:
+        st.session_state.uploader_key = 0
+    st.session_state.uploader_key += 1
+    st.rerun()
+
+def process_uploaded_files():
+    if 'uploaded_files' in st.session_state and st.session_state.uploaded_files:
+        for uploaded_file in st.session_state.uploaded_files:
+            if uploaded_file.name not in st.session_state.rag_sources:
+                # Read the file content first as a simple sanity check
+                try:
+                    content = uploaded_file.getvalue().decode('utf-8')
+                    st.success(f"✅ {uploaded_file.name} uploaded successfully! Content length: {len(content)} characters")
+                    st.session_state.rag_sources.append(uploaded_file.name)
+
+                    # Set document_loaded to True when we have files
+                    st.session_state.document_loaded = True
+
+                    # Now process with the RAG pipeline
+                    with st.spinner(f"Processing {uploaded_file.name} with RAG..."):
+                        success = process_uploaded_document(uploaded_file)
+                        if success:
+                            st.success(f"✅ {uploaded_file.name} RAG processing completed!")
+                        else:
+                            st.error(f"❌ RAG processing failed for {uploaded_file.name}")
+
+                except Exception as e:
+                    st.error(f"❌ Error reading {uploaded_file.name}: {e}")
+
+        # Clear the uploaded files from session state to prevent reprocessing
+        st.session_state.uploaded_files = []
+
+def main():
+    # Setup page configuration and styling
+    setup_page_config()
+    load_custom_css()
+
+    # Initialize session state
+    initialize_session_state()
+
+    # Render main header
+    render_header()
+
+    # Add getting started section
+    if not st.session_state.document_loaded:
+        render_getting_started()
+
+    # Clear buttons
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Clear Chat", type="primary"):
+            st.session_state.messages.clear()
+            st.rerun()
+    with col2:
+        if st.button("Clear All Documents", type="secondary"):
+            clear_all_documents()
+
+    # Initialize uploader key
+    if 'uploader_key' not in st.session_state:
+        st.session_state.uploader_key = 0
+
+    # File upload input
+    uploaded_files = st.file_uploader(
+        "📄 Upload a text document (.txt only, max 200MB)",
+        type=["txt"],
+        accept_multiple_files=True,
+        key=f"rag_docs_{st.session_state.uploader_key}"
+    )
+
+    # Store uploaded files in session state and process them
+    if uploaded_files:
+        st.session_state.uploaded_files = uploaded_files
+        st.info(f"Files uploaded: {[f.name for f in uploaded_files]}")
+        process_uploaded_files()
+
+    # Show documents in DB with individual remove buttons
+    with st.expander(f"📚 Documents in DB ({len(st.session_state.rag_sources)})"):
+        if st.session_state.rag_sources:
+            for i, doc in enumerate(st.session_state.rag_sources):
+                col1, col2 = st.columns([3, 1])
+                with col1:
+                    st.write(f"• {doc}")
+                with col2:
+                    if st.button("🗑️", key=f"remove_doc_{i}_{doc}"):
+                        # Remove the document
+                        st.session_state.rag_sources.pop(i)
+                        # Reset document_loaded if no documents left
+                        if len(st.session_state.rag_sources) == 0:
+                            st.session_state.document_loaded = False
+                            st.session_state.document_stats = None
+                            st.session_state.rag_pipeline = None
+                        st.rerun()
+        else:
+            st.write("No documents in database")
+
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Chat input
+    if prompt := st.chat_input("Your message"):
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        with st.chat_message("assistant"):
+            # RAG response
+            answer, source_docs = handle_user_query(prompt)
+            st.write(answer)
+
+            # Show source documents if available
+            if source_docs and isinstance(source_docs, list) and len(source_docs) > 0:
+                with st.expander("📄 View Source Documents"):
+                    for i, doc in enumerate(source_docs[:3]):  # Show top 3 sources
+                        st.markdown(f"**Source {i+1}:**")
+                        st.markdown(f'{doc.page_content[:300]}{"..." if len(doc.page_content) > 300 else ""}')
+                        st.divider()
+
+    # System information
+    if st.session_state.rag_pipeline:
+        system_info = st.session_state.rag_pipeline.get_system_info()
+        render_system_info(system_info)
+
+
+if __name__ == "__main__":
+    main()
docker-compose.yaml
ADDED
@@ -0,0 +1,26 @@
+services:
+  rag-app:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: rag-document-system
+    ports:
+      - "8501:8501"
+    environment:
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
+      - CHUNK_SIZE=${CHUNK_SIZE:-1000}
+      - CHUNK_OVERLAP=${CHUNK_OVERLAP:-200}
+      - EMBEDDING_MODEL=${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
+      - PERSIST_DIRECTORY=/app/chroma_db
+      - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.3}
+    volumes:
+      - ./chroma_db:/app/chroma_db
+      - ./documents:/app/documents
+    env_file:
+      - .env
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+streamlit>=1.28.0
+langchain>=0.1.0
+langchain-huggingface>=0.0.10
+langchain-community>=0.0.10
+langchain-core>=0.1.0
+langchain-chroma>=0.0.10
+langchain-google-genai>=0.0.6
+google-generativeai>=0.3.0
+chromadb>=0.4.0
+python-dotenv>=1.0.0
+typing-extensions>=4.5.0
src/config.py
ADDED
@@ -0,0 +1,97 @@
+import os
+import sys
+from typing import Dict, Any
+
+class Config:
+
+    # Document Processing
+    DEFAULT_CHUNK_SIZE = 1000
+    DEFAULT_CHUNK_OVERLAP = 200
+    DEFAULT_ENCODING = 'utf-8'
+
+    # Embedding Model
+    DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+
+    # Vector Store
+    DEFAULT_PERSIST_DIRECTORY = "./chroma_db"
+    DEFAULT_RETRIEVAL_K = 5
+
+    # LLM Settings
+    DEFAULT_TEMPERATURE = 0.3
+    DEFAULT_CHAIN_TYPE = "stuff"
+
+    # File Settings
+    SUPPORTED_FILE_TYPES = ["txt"]
+    MAX_FILE_SIZE_MB = 100
+
+    @classmethod
+    def get_doc_processing_config(cls) -> Dict[str, Any]:
+        return {
+            'chunk_size': int(os.getenv('CHUNK_SIZE', cls.DEFAULT_CHUNK_SIZE)),
+            'chunk_overlap': int(os.getenv('CHUNK_OVERLAP', cls.DEFAULT_CHUNK_OVERLAP)),
+            'encoding': os.getenv('ENCODING', cls.DEFAULT_ENCODING)
+        }
+
+    @classmethod
+    def get_embedding_config(cls) -> Dict[str, Any]:
+        return {
+            'model_name': os.getenv('EMBEDDING_MODEL', cls.DEFAULT_EMBEDDING_MODEL),
+        }
+
+    @classmethod
+    def get_vector_store_config(cls) -> Dict[str, Any]:
+        return {
+            'persist_directory': os.getenv('PERSIST_DIRECTORY', cls.DEFAULT_PERSIST_DIRECTORY),
+            'retrieval_k': int(os.getenv('RETRIEVAL_K', cls.DEFAULT_RETRIEVAL_K))
+        }
+
+    @classmethod
+    def get_llm_config(cls) -> Dict[str, Any]:
+        return {
+            'temperature': float(os.getenv('LLM_TEMPERATURE', cls.DEFAULT_TEMPERATURE)),
+            'chain_type': os.getenv('LLM_CHAIN_TYPE', cls.DEFAULT_CHAIN_TYPE),
+            'api_key': os.getenv('GOOGLE_API_KEY')
+        }
+
+    @classmethod
+    def get_file_settings(cls) -> Dict[str, Any]:
+        return {
+            'supported_types': cls.SUPPORTED_FILE_TYPES,
+            'max_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', cls.MAX_FILE_SIZE_MB))
+        }
+
+    @classmethod
+    def get_all_configs(cls) -> Dict[str, Any]:
+        return {
+            'document_processing': cls.get_doc_processing_config(),
+            'embedding': cls.get_embedding_config(),
+            'vector_store': cls.get_vector_store_config(),
+            'llm': cls.get_llm_config(),
+            'file_settings': cls.get_file_settings()
+        }
+
+    @classmethod
+    def validate_config(cls) -> bool:
+        llm_config = cls.get_llm_config()
+
+        if not llm_config['api_key']:
+            return False
+
+        return True
+
+    @classmethod
+    def get_environment_info(cls) -> Dict[str, Any]:
+        return {
+            'python_version': sys.version,
+            'environment_variables': {
+                'GOOGLE_API_KEY': 'SET' if os.getenv('GOOGLE_API_KEY') else 'NOT SET',
+                'CHUNK_SIZE': os.getenv('CHUNK_SIZE', 'DEFAULT'),
+                'EMBEDDING_MODEL': os.getenv('EMBEDDING_MODEL', 'DEFAULT'),
+                'PERSIST_DIRECTORY': os.getenv('PERSIST_DIRECTORY', 'DEFAULT'),
+            }
+        }
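Because every getter calls `os.getenv` at call time rather than at import time, environment overrides take effect without re-importing; a short sketch of what that implies (illustrative values):

```python
# Illustrative sketch: Config getters read the environment at call time.
import os
from src.config import Config

os.environ["CHUNK_SIZE"] = "500"                          # override before the getter runs
print(Config.get_doc_processing_config()["chunk_size"])   # 500, not the class default of 1000
print(Config.validate_config())                           # False unless GOOGLE_API_KEY is set
```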
src/document_processor.py
ADDED
@@ -0,0 +1,82 @@
+import logging
+from typing import List, Optional
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders.text import TextLoader
+from langchain_core.documents import Document
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+
+class DocumentProcessor:
+    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
+        config = Config.get_doc_processing_config()
+        self.chunk_size = chunk_size or config['chunk_size']
+        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+
+    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
+        try:
+            config = Config.get_doc_processing_config()
+            encoding = encoding or config['encoding']
+            logger.info(f"Loading document from {file_path}")
+            loader = TextLoader(file_path, encoding=encoding)
+            documents = loader.load()
+            logger.info(f"Successfully loaded {len(documents)} document(s)")
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error loading document from {file_path}: {e}")
+            raise e
+
+    def chunk_documents(self, documents: List[Document]) -> List[Document]:
+        try:
+            logger.info(f"Chunking {len(documents)} document(s)")
+            chunks = self.text_splitter.split_documents(documents)
+            logger.info(f"Successfully created {len(chunks)} chunk(s)")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error chunking documents: {e}")
+            raise e
+
+    def process_document(self, file_path: str) -> List[Document]:
+        try:
+            documents = self.load_document(file_path)
+            chunks = self.chunk_documents(documents)
+            logger.info(f"Document processing completed: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing document: {e}")
+            raise e
+
+    def get_document_stats(self, chunks: List[Document]) -> dict:
+        if not chunks:
+            return {
+                'total_chunks': 0,
+                'total_characters': 0,
+                'avg_chunk_size': 0,
+                'min_chunk_size': 0,
+                'max_chunk_size': 0
+            }
+
+        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
+        total_chars = sum(chunk_sizes)
+
+        return {
+            'total_chunks': len(chunks),
+            'total_characters': total_chars,
+            'avg_chunk_size': total_chars / len(chunks),
+            'min_chunk_size': min(chunk_sizes),
+            'max_chunk_size': max(chunk_sizes)
+        }
ADDED
@@ -0,0 +1,66 @@
|
+import logging
+from typing import List, Optional
+#from langchain_openai import OpenAIEmbeddings
+from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+
+logger = logging.getLogger(__name__)
+
+class EmbeddingManager:
+    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):  # text-embedding-3-small
+        self.model_name = model_name
+        self.embeddings = None
+        self._initialize_embeddings()
+
+    def _initialize_embeddings(self):
+        try:
+            logger.info(f"Initializing embedding model: {self.model_name}")
+            # HuggingFaceEmbeddings expects model_name= (model= fails validation)
+            self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name, model_kwargs={'device': 'cpu'})
+            logger.info("Embedding model initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing embedding model: {e}")
+            raise e
+
+    def get_embeddings(self) -> HuggingFaceEmbeddings:
+        if self.embeddings is None:
+            self._initialize_embeddings()
+        return self.embeddings
+
+    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
+        try:
+            logger.info(f"Generating embeddings for {len(texts)} text(s)")
+            embeddings = self.embeddings.embed_documents(texts)
+            logger.info(f"Successfully generated {len(embeddings)} embeddings")
+            return embeddings
+
+        except Exception as e:
+            logger.error(f"Error generating embeddings: {e}")
+            raise e
+
+    def generate_single_embedding(self, text: str) -> List[float]:
+        try:
+            embedding = self.embeddings.embed_query(text)
+            return embedding
+
+        except Exception as e:
+            logger.error(f"Error generating single embedding: {e}")
+            raise e
+
+    def get_embedding_dimension(self) -> int:
+        try:
+            test_embedding = self.generate_single_embedding("test")
+            return len(test_embedding)
+
+        except Exception as e:
+            logger.error(f"Error getting embedding dimension: {e}")
+            raise e
+
+    def get_model_info(self) -> dict:
+        return {
+            'model_name': self.model_name,
+            'dimension': self.get_embedding_dimension(),
+            'is_initialized': self.embeddings is not None
+        }
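The default `all-MiniLM-L6-v2` model produces 384-dimensional vectors, which `get_embedding_dimension` confirms empirically by embedding a probe string; a quick sketch (the first call downloads the model):

```python
# Sketch: inspecting the default embedding model.
from src.embedding_manager import EmbeddingManager

em = EmbeddingManager()                            # downloads the model on first use
vec = em.generate_single_embedding("hello world")
print(len(vec))                                    # 384 for all-MiniLM-L6-v2
print(em.get_model_info())                         # name, dimension, init status
```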
src/rag_pipeline.py
ADDED
@@ -0,0 +1,161 @@
+import logging
+import os
+from typing import List, Optional, Tuple
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.chains import RetrievalQA
+from langchain_core.documents import Document
+import google.generativeai as genai
+
+from .document_processor import DocumentProcessor
+from .embedding_manager import EmbeddingManager
+from .vector_store import VectorStoreManager
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+# Load API key from .env file
+google_api_key = os.environ.get("GOOGLE_API_KEY")
+if not google_api_key:
+    raise ValueError("GOOGLE_API_KEY not found in .env file")
+
+class RAGPipeline:
+    def __init__(self, api_key: Optional[str] = None, chunk_size: int = 1000, chunk_overlap: int = 200, embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", persist_directory: str = "./chroma_db", temperature: float = 0.3):
+        self.api_key = api_key
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.embedding_model = embedding_model
+        self.persist_directory = persist_directory
+        self.temperature = temperature
+        self.document_processor = None
+        self.embedding_manager = None
+        self.vector_store_manager = None
+        self.llm = None
+        self.qa_chain = None
+
+        self._initialize_components()
+
+    def _initialize_components(self):
+        try:
+            logger.info("Initializing RAG Pipeline components")
+
+            genai.configure(api_key=google_api_key)
+
+            self.document_processor = DocumentProcessor(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+            self.embedding_manager = EmbeddingManager(model_name=self.embedding_model)
+            self.vector_store_manager = VectorStoreManager(persist_directory=self.persist_directory, embedding_function=self.embedding_manager.get_embeddings())
+            self.vector_store_manager.initialize_vector_store()
+            self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=self.temperature)
+
+            logger.info("RAG Pipeline components initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing RAG Pipeline components: {e}")
+            raise e
+
+    def process_document(self, file_path: str) -> bool:
+        try:
+            logger.info(f"Processing document: {file_path}")
+            # Chunk document
+            chunks = self.document_processor.process_document(file_path)
+            if not chunks:
+                logger.error("No chunks generated from document")
+                return False
+            # Add chunks to vector store
+            success = self.vector_store_manager.add_documents(chunks)
+            if not success:
+                logger.error("Failed to add chunks to vector store")
+                return False
+            # Initialize QA chain
+            retriever = self.vector_store_manager.get_retriever()
+            self.qa_chain = RetrievalQA.from_chain_type(
+                llm=self.llm,
+                chain_type="stuff",
+                retriever=retriever,
+                return_source_documents=True
+            )
+
+            logger.info("Document processed successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error processing document: {e}")
+            return False
+
+    def query(self, question: str) -> Tuple[str, List[Document]]:
+        try:
+            if not self.qa_chain:
+                return "Please process a document first before asking questions.", []
+            logger.info(f"Processing query: '{question}'")
+            # invoke() replaces the deprecated direct-call style, chain({"query": ...})
+            response = self.qa_chain.invoke({"query": question})
+            answer = response['result']
+            source_docs = response.get("source_documents", [])
+            logger.info(f"Query completed successfully. Answer length: {len(answer)}")
+            return answer, source_docs
+
+        except Exception as e:
+            logger.error(f"Error processing query: {e}")
+            return f"Error processing query: {str(e)}", []
+
+    def get_system_info(self) -> dict:
+        try:
+            info = {
+                'chunk_size': self.chunk_size,
+                'chunk_overlap': self.chunk_overlap,
+                'embedding_model': self.embedding_model,
+                'persist_directory': self.persist_directory,
+                'temperature': self.temperature,
+                'components_initialized': {
+                    'document_processor': self.document_processor is not None,
+                    'embedding_manager': self.embedding_manager is not None,
+                    'vector_store_manager': self.vector_store_manager is not None,
+                    'llm': self.llm is not None,
+                    'qa_chain': self.qa_chain is not None
+                }
+            }
+
+            # Add embedding model info
+            if self.embedding_manager:
+                info['embedding_info'] = self.embedding_manager.get_model_info()
+
+            # Add vector store stats
+            if self.vector_store_manager:
+                info['vector_store_stats'] = self.vector_store_manager.get_collection_stats()
+
+            return info
+
+        except Exception as e:
+            logger.error(f"Error getting system info: {e}")
+            return {}
+
+    def clear_knowledge_base(self) -> bool:
+        try:
+            logger.info("Clearing knowledge base")
+
+            # Clear vector store
+            if self.vector_store_manager:
+                self.vector_store_manager.clear_vector_store()
+
+            # Reset QA chain
+            self.qa_chain = None
+
+            logger.info("Knowledge base cleared successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error clearing knowledge base: {e}")
+            return False
+
+    def is_ready(self) -> bool:
+        return (
+            self.document_processor is not None and
+            self.embedding_manager is not None and
+            self.vector_store_manager is not None and
+            self.llm is not None and
+            self.qa_chain is not None
+        )
src/ui_components.py
ADDED
@@ -0,0 +1,89 @@
+import streamlit as st
+
+def setup_page_config():
+    st.set_page_config(
+        page_title="RAG Document System",
+        page_icon="📚",
+        layout="centered"
+    )
+
+def load_custom_css():
+    st.markdown("""
+    <style>
+    .info-box {
+        background-color: #f0f2f6;
+        color: #000;
+        padding: 0.5rem;
+        margin: 0.5rem;
+        border-radius: 0.5rem;
+        border-left: 4px solid #1f77b4;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+def render_header():
+    st.markdown('<h1 class="main-header">📚 RAG Document System</h1>', unsafe_allow_html=True)
+    st.markdown('<h2 class="sub-header">Upload and interact with your documents</h2>', unsafe_allow_html=True)
+
+def render_getting_started():
+    st.markdown("""
+    <div class="info-box">
+    <h4>Getting Started</h4>
+    <p>1. Upload a text document (.txt) using the file uploader above</p>
+    <p>2. Wait for the document to be processed</p>
+    <p>3. Start asking questions about your document!</p>
+    </div>
+    """, unsafe_allow_html=True)
+
+def render_system_info(system_info: dict):
+    """Render system information"""
+    with st.expander("🔧 System Information"):
+        if not system_info:
+            st.info("System information not available")
+            return
+
+        # Basic configuration
+        st.markdown("**Configuration:**")
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.write(f"• Chunk Size: {system_info.get('chunk_size', 'N/A')}")
+            st.write(f"• Chunk Overlap: {system_info.get('chunk_overlap', 'N/A')}")
+            st.write(f"• Temperature: {system_info.get('temperature', 'N/A')}")
+
+        with col2:
+            st.write(f"• Embedding Model: {system_info.get('embedding_model', 'N/A')}")
+            st.write(f"• Persist Directory: {system_info.get('persist_directory', 'N/A')}")
+
+        # Component status
+        st.markdown("**Component Status:**")
+        components = system_info.get('components_initialized', {})
+        for component, status in components.items():
+            status_icon = "✅" if status else "❌"
+            st.write(f"{status_icon} {component.replace('_', ' ').title()}")
+
+        # Embedding info
+        if 'embedding_info' in system_info:
+            st.markdown("**Embedding Model Info:**")
+            embedding_info = system_info['embedding_info']
+            st.write(f"• Model: {embedding_info.get('model_name', 'N/A')}")
+            st.write(f"• Device: {embedding_info.get('device', 'N/A')}")
+            st.write(f"• Dimensions: {embedding_info.get('dimension', 'N/A')}")
+
+        # Vector store stats
+        if 'vector_store_stats' in system_info:
+            st.markdown("**Vector Store Stats:**")
+            vector_stats = system_info['vector_store_stats']
+            st.write(f"• Total Documents: {vector_stats.get('total_documents', 0)}")
+            st.write(f"• Collection: {vector_stats.get('collection_name', 'N/A')}")
+
+def render_processing_spinner(message: str = "Processing..."):
+    return st.spinner(message)
src/vector_store.py
ADDED
@@ -0,0 +1,144 @@
+import logging
+import os
+from typing import List, Optional, Tuple
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+
+logger = logging.getLogger(__name__)
+
+class VectorStoreManager:
+    def __init__(self, persist_directory: str = "./chroma_db", embedding_function: Optional[Embeddings] = None):
+        self.persist_directory = persist_directory
+        self.embedding_function = embedding_function
+        self.vector_store = None
+        self._ensure_persist_directory()
+
+    def _ensure_persist_directory(self):
+        try:
+            os.makedirs(self.persist_directory, exist_ok=True)
+            logger.info(f"Persist directory ensured: {self.persist_directory}")
+        except Exception as e:
+            logger.error(f"Error creating persist directory: {e}")
+            raise e
+
+    def initialize_vector_store(self, embedding_function: Optional[Embeddings] = None):
+        if embedding_function:
+            self.embedding_function = embedding_function
+
+        if not self.embedding_function:
+            raise ValueError("Embedding function must be provided")
+
+        try:
+            logger.info("Initializing vector store")
+            self.vector_store = Chroma(
+                persist_directory=self.persist_directory,
+                embedding_function=self.embedding_function
+            )
+            logger.info("Vector store initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing vector store: {e}")
+            raise e
+
+    def add_documents(self, documents: List[Document]) -> bool:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Adding {len(documents)} document(s) to vector store")
+            self.vector_store.add_documents(documents)
+            logger.info("Documents added successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error adding documents to vector store: {e}")
+            return False
+
+    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Performing similarity search for query: '{query[:50]}...'")
+            results = self.vector_store.similarity_search(query, k=k)
+            logger.info(f"Found {len(results)} similar documents")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error performing similarity search: {e}")
+            return []
+
+    def similarity_search_with_score(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Performing similarity search with scores for query: '{query[:50]}...'")
+            results = self.vector_store.similarity_search_with_score(query, k=k)
+            logger.info(f"Found {len(results)} similar documents with scores")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error performing similarity search with scores: {e}")
+            return []
+
+    def get_retriever(self, search_kwargs: Optional[dict] = None):
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            default_kwargs = {"k": 5}
+            if search_kwargs:
+                default_kwargs.update(search_kwargs)
+
+            retriever = self.vector_store.as_retriever(search_kwargs=default_kwargs)
+            logger.info("Retriever created successfully")
+            return retriever
+
+        except Exception as e:
+            logger.error(f"Error creating retriever: {e}")
+            raise e
+
+    def get_collection_stats(self) -> dict:
+        try:
+            if not self.vector_store:
+                return {'total_documents': 0, 'collection_name': None}
+
+            collection = self.vector_store._collection
+            count = collection.count()
+
+            return {
+                'total_documents': count,
+                'collection_name': collection.name,
+                'persist_directory': self.persist_directory
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting collection stats: {e}")
+            return {'total_documents': 0, 'collection_name': None}
+
+    def clear_vector_store(self) -> bool:
+        try:
+            if not self.vector_store:
+                return True
+
+            logger.info("Clearing vector store")
+            # Chroma rejects an empty `where` filter, so delete by explicit ids
+            ids = self.vector_store._collection.get()['ids']
+            if ids:
+                self.vector_store._collection.delete(ids=ids)
+            logger.info("Vector store cleared successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error clearing vector store: {e}")
+            return False
+
+    def is_initialized(self) -> bool:
+        return self.vector_store is not None
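For completeness, the manager can also be driven directly, outside `RAGPipeline`; a sketch reusing the `EmbeddingManager` from this commit (the query string is illustrative):

```python
# Sketch: using VectorStoreManager directly, outside RAGPipeline.
from src.embedding_manager import EmbeddingManager
from src.vector_store import VectorStoreManager

manager = VectorStoreManager(
    persist_directory="./chroma_db",
    embedding_function=EmbeddingManager().get_embeddings()
)
manager.initialize_vector_store()
print(manager.get_collection_stats())

# scores are Chroma distances: lower means more similar
for doc, score in manager.similarity_search_with_score("example query", k=3):
    print(round(score, 3), doc.page_content[:80])
```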