Upload folder using huggingface_hub
- .dockerignore +19 -0
- .env.example +5 -0
- .gitignore +64 -0
- Dockerfile +40 -0
- LICENSE +21 -0
- README.md +84 -10
- app.py +259 -0
- docker-compose.yaml +26 -0
- requirements.txt +11 -0
- src/config.py +97 -0
- src/document_processor.py +82 -0
- src/embedding_manager.py +66 -0
- src/rag_pipeline.py +161 -0
- src/ui_components.py +89 -0
- src/vector_store.py +144 -0
.dockerignore
ADDED
@@ -0,0 +1,19 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+.git/
+.gitignore
+README.md
+.vscode/
+*.egg-info/
+dist/
+build/
+.pytest_cache/
+.coverage
+.tox/
+.cache
+chroma_db/
+documents/
.env.example
ADDED
@@ -0,0 +1,5 @@
+GOOGLE_API_KEY=your_google_api_key_here
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+LLM_TEMPERATURE=0.3
.gitignore
ADDED
@@ -0,0 +1,64 @@
+# Environment variables (keep .env.example)
+.env
+!.env.example
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Testing
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Virtual environments
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Application specific
+chroma_db/
+documents/
+*.log
Dockerfile
ADDED
@@ -0,0 +1,40 @@
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements file first (for better caching)
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy source code
+COPY src/ ./src/
+COPY app.py .
+
+# Create directories for data persistence
+RUN mkdir -p /app/chroma_db /app/documents
+
+# Create a non-root user
+RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+USER appuser
+
+# Expose Streamlit port
+EXPOSE 8501
+
+# Run the application
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 SneakyGraySnake
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,10 +1,84 @@
+# RAG Document System
+
+A simple document interaction system using Retrieval-Augmented Generation (RAG) with Streamlit and Google's Gemini AI.
+
+## Features
+
+- Upload text documents (.txt files)
+- Ask questions about your documents
+- Get AI-powered answers with source citations
+- Persistent vector database storage
+- Clean web interface
+
+## Setup
+
+### With Docker (Recommended)
+
+1. Clone the repository:
+```bash
+git clone https://github.com/tusiim3/RAG-Document-System.git
+cd RAG-Document-System
+```
+
+2. Copy `.env.example` to `.env` and add your Google API key:
+```bash
+cp .env.example .env
+```
+
+3. Run with Docker Compose:
+```bash
+docker-compose up --build
+```
+
+4. Open http://localhost:8501 in your browser
+
+### Without Docker
+
+1. Clone the repository:
+```bash
+git clone https://github.com/tusiim3/RAG-Document-System.git
+cd RAG-Document-System
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Copy `.env.example` to `.env` and add your Google API key:
+```bash
+cp .env.example .env
+```
+
+4. Run the application:
+```bash
+streamlit run app.py
+```
+
+5. Open http://localhost:8501 in your browser
+
+## Environment Variables
+
+Set in the `.env` file (`GOOGLE_API_KEY` is required; the rest have defaults):
+- `GOOGLE_API_KEY` - Your Google API key for Gemini
+- `CHUNK_SIZE` - Text chunk size in characters (default: 1000)
+- `CHUNK_OVERLAP` - Overlap between consecutive chunks (default: 200)
+- `EMBEDDING_MODEL` - Embedding model name
+- `LLM_TEMPERATURE` - AI response temperature (default: 0.3)
+
+## Usage
+
+1. Upload a text document using the file uploader
+2. Wait for document processing to complete
+3. Ask questions about the document in the chat interface
+4. View source documents for each answer
+
+## Technology Stack
+
+- Streamlit for web interface
+- LangChain for document processing
+- ChromaDB for vector storage
+- Google Gemini for AI responses
+- Docker for containerization
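For readers who want to exercise the pipeline outside Streamlit, a minimal sketch (a hypothetical script, not part of this commit; it assumes a `.env` containing `GOOGLE_API_KEY` and a plain-text file at the illustrative path `docs/sample.txt`):

```python
# Hypothetical usage sketch -- not part of this commit.
from src.rag_pipeline import RAGPipeline

pipeline = RAGPipeline(chunk_size=1000, chunk_overlap=200)

# process_document() chunks, embeds, and indexes the file, then builds the QA chain
if pipeline.process_document("docs/sample.txt"):  # illustrative path
    answer, sources = pipeline.query("What is this document about?")
    print(answer)
    for doc in sources[:3]:
        print("-", doc.page_content[:100])
```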
app.py
ADDED
@@ -0,0 +1,259 @@
+import streamlit as st
+import os
+import tempfile
+import logging
+from dotenv import load_dotenv
+import uuid
+
+# UI components live in src/ui_components.py for easier debugging and maintenance
+
+from src.ui_components import (
+    setup_page_config, load_custom_css, render_header,
+    render_getting_started, render_system_info,
+    render_processing_spinner
+)
+from src.rag_pipeline import RAGPipeline
+
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def initialize_session_state():
+    if 'session_id' not in st.session_state:
+        st.session_state.session_id = str(uuid.uuid4())
+
+    if 'rag_pipeline' not in st.session_state:
+        st.session_state.rag_pipeline = None
+
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+
+    if 'rag_sources' not in st.session_state:
+        st.session_state.rag_sources = []
+
+    if 'document_loaded' not in st.session_state:
+        st.session_state.document_loaded = False
+
+    if 'document_stats' not in st.session_state:
+        st.session_state.document_stats = None
+
+def process_uploaded_document(uploaded_file):
+    try:
+        st.info(f"Starting to process: {uploaded_file.name}")
+
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt', mode='w', encoding='utf-8') as tmp_file:
+            content = uploaded_file.getvalue().decode('utf-8')
+            tmp_file.write(content)
+            tmp_file_path = tmp_file.name
+
+        st.info(f"File saved temporarily at: {tmp_file_path}")
+        st.info(f"File content length: {len(content)} characters")
+
+        # Initialize RAG pipeline if not already done
+        if st.session_state.rag_pipeline is None:
+            st.info("Initializing RAG pipeline...")
+            st.session_state.rag_pipeline = RAGPipeline()
+
+        # Process document
+        st.info("Processing document through RAG pipeline...")
+        success = st.session_state.rag_pipeline.process_document(tmp_file_path)
+
+        if success:
+            st.info("Document processed successfully, getting statistics...")
+            # Get document statistics
+            chunks = st.session_state.rag_pipeline.document_processor.process_document(tmp_file_path)
+            stats = st.session_state.rag_pipeline.document_processor.get_document_stats(chunks)
+
+            # Update session state
+            st.session_state.document_loaded = True
+            st.session_state.document_stats = stats
+
+            st.info(f"Document processed successfully: {stats['total_chunks']} chunks")
+        else:
+            st.error("Failed to process document")
+
+        # Clean up temporary file
+        os.unlink(tmp_file_path)
+
+        return success
+
+    except Exception as e:
+        st.error(f"Error processing uploaded document: {e}")
+        logger.error(f"Error processing uploaded document: {e}")
+        return False
+
+def handle_user_query(user_question):
+    try:
+        if not st.session_state.rag_pipeline or not st.session_state.document_loaded:
+            return "Please upload a document first before asking questions.", []
+
+        # Note: the caller appends the user message to the history before calling
+        # this function, so it is not appended again here (doing both duplicated it)
+
+        # Get response from RAG pipeline
+        with render_processing_spinner("Thinking..."):
+            answer, source_docs = st.session_state.rag_pipeline.query(user_question)
+
+        # Add assistant response to messages
+        st.session_state.messages.append({
+            "role": "assistant",
+            "content": answer,
+            "sources": source_docs
+        })
+
+        logger.info(f"Query processed: '{user_question[:50]}...'")
+        return answer, source_docs
+
+    except Exception as e:
+        logger.error(f"Error handling user query: {e}")
+        error_message = f"Error processing query: {str(e)}"
+        st.session_state.messages.append({"role": "assistant", "content": error_message, "sources": []})
+        return error_message, []
+
+def clear_all_documents():
+    # Clear the vector store first, while the pipeline reference still exists
+    # (resetting rag_pipeline to None beforehand would silently skip this step)
+    if st.session_state.rag_pipeline and st.session_state.rag_pipeline.vector_store_manager:
+        st.session_state.rag_pipeline.vector_store_manager.clear_vector_store()
+
+    st.session_state.rag_sources = []
+    st.session_state.document_loaded = False
+    st.session_state.document_stats = None
+    st.session_state.rag_pipeline = None
+    st.session_state.uploaded_files = []
+
+    # Increment uploader key to reset file uploader
+    if 'uploader_key' not in st.session_state:
+        st.session_state.uploader_key = 0
+    st.session_state.uploader_key += 1
+    st.rerun()
+
+def process_uploaded_files():
+    if 'uploaded_files' in st.session_state and st.session_state.uploaded_files:
+        for uploaded_file in st.session_state.uploaded_files:
+            if uploaded_file.name not in st.session_state.rag_sources:
+                # Read the file content first as a simple sanity check
+                try:
+                    content = uploaded_file.getvalue().decode('utf-8')
+                    st.success(f"✅ {uploaded_file.name} uploaded successfully! Content length: {len(content)} characters")
+                    st.session_state.rag_sources.append(uploaded_file.name)
+
+                    # Set document_loaded to True when we have files
+                    st.session_state.document_loaded = True
+
+                    # Now process with the RAG pipeline
+                    with st.spinner(f"Processing {uploaded_file.name} with RAG..."):
+                        success = process_uploaded_document(uploaded_file)
+                        if success:
+                            st.success(f"✅ {uploaded_file.name} RAG processing completed!")
+                        else:
+                            st.error(f"❌ RAG processing failed for {uploaded_file.name}")
+
+                except Exception as e:
+                    st.error(f"❌ Error reading {uploaded_file.name}: {e}")
+
+        # Clear the uploaded files from session state to prevent reprocessing
+        st.session_state.uploaded_files = []
+
+def main():
+    # Setup page configuration and styling
+    setup_page_config()
+    load_custom_css()
+
+    # Initialize session state
+    initialize_session_state()
+
+    # Render main header
+    render_header()
+
+    # Add getting started section
+    if not st.session_state.document_loaded:
+        render_getting_started()
+
+    # Clear buttons
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Clear Chat", type="primary"):
+            st.session_state.messages.clear()
+            st.rerun()
+    with col2:
+        if st.button("Clear All Documents", type="secondary"):
+            clear_all_documents()
+
+    # Initialize uploader key
+    if 'uploader_key' not in st.session_state:
+        st.session_state.uploader_key = 0
+
+    # File upload input
+    uploaded_files = st.file_uploader(
+        "📄 Upload a text document (.txt only, max 200MB)",
+        type=["txt"],
+        accept_multiple_files=True,
+        key=f"rag_docs_{st.session_state.uploader_key}"
+    )
+
+    # Store uploaded files in session state and process them
+    if uploaded_files:
+        st.session_state.uploaded_files = uploaded_files
+        st.info(f"Files uploaded: {[f.name for f in uploaded_files]}")
+        process_uploaded_files()
+
+    # Show documents in DB with individual remove buttons
+    with st.expander(f"📚 Documents in DB ({len(st.session_state.rag_sources)})"):
+        if st.session_state.rag_sources:
+            for i, doc in enumerate(st.session_state.rag_sources):
+                col1, col2 = st.columns([3, 1])
+                with col1:
+                    st.write(f"• {doc}")
+                with col2:
+                    if st.button("🗑️", key=f"remove_doc_{i}_{doc}"):
+                        # Remove the document
+                        st.session_state.rag_sources.pop(i)
+                        # Reset document_loaded if no documents left
+                        if len(st.session_state.rag_sources) == 0:
+                            st.session_state.document_loaded = False
+                            st.session_state.document_stats = None
+                            st.session_state.rag_pipeline = None
+                        st.rerun()
+        else:
+            st.write("No documents in database")
+
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Chat input
+    if prompt := st.chat_input("Your message"):
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        with st.chat_message("assistant"):
+            # RAG response
+            answer, source_docs = handle_user_query(prompt)
+            st.write(answer)
+
+            # Show source documents if available
+            if source_docs and isinstance(source_docs, list) and len(source_docs) > 0:
+                with st.expander("📄 View Source Documents"):
+                    for i, doc in enumerate(source_docs[:3]):  # Show top 3 sources
+                        st.markdown(f"**Source {i+1}:**")
+                        st.markdown(f'{doc.page_content[:300]}{"..." if len(doc.page_content) > 300 else ""}')
+                        st.divider()
+
+    # System information
+    if st.session_state.rag_pipeline:
+        system_info = st.session_state.rag_pipeline.get_system_info()
+        render_system_info(system_info)
+
+
+if __name__ == "__main__":
+    main()
docker-compose.yaml
ADDED
@@ -0,0 +1,26 @@
+services:
+  rag-app:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: rag-document-system
+    ports:
+      - "8501:8501"
+    environment:
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
+      - CHUNK_SIZE=${CHUNK_SIZE:-1000}
+      - CHUNK_OVERLAP=${CHUNK_OVERLAP:-200}
+      - EMBEDDING_MODEL=${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
+      - PERSIST_DIRECTORY=/app/chroma_db
+      - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.3}
+    volumes:
+      - ./chroma_db:/app/chroma_db
+      - ./documents:/app/documents
+    env_file:
+      - .env
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+streamlit>=1.28.0
+langchain>=0.1.0
+langchain-huggingface>=0.0.10
+langchain-community>=0.0.10
+langchain-core>=0.1.0
+langchain-chroma>=0.0.10
+langchain-google-genai>=0.0.6
+google-generativeai>=0.3.0
+chromadb>=0.4.0
+python-dotenv>=1.0.0
+typing-extensions>=4.5.0
src/config.py
ADDED
@@ -0,0 +1,97 @@
+import os
+import sys
+from typing import Dict, Any
+
+class Config:
+
+    # Document Processing
+    DEFAULT_CHUNK_SIZE = 1000
+    DEFAULT_CHUNK_OVERLAP = 200
+    DEFAULT_ENCODING = 'utf-8'
+
+    # Embedding Model
+    DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+
+    # Vector Store
+    DEFAULT_PERSIST_DIRECTORY = "./chroma_db"
+    DEFAULT_RETRIEVAL_K = 5
+
+    # LLM Settings
+    DEFAULT_TEMPERATURE = 0.3
+    DEFAULT_CHAIN_TYPE = "stuff"
+
+    # File Settings
+    SUPPORTED_FILE_TYPES = ["txt"]
+    MAX_FILE_SIZE_MB = 100
+
+    @classmethod
+    def get_doc_processing_config(cls) -> Dict[str, Any]:
+        return {
+            'chunk_size': int(os.getenv('CHUNK_SIZE', cls.DEFAULT_CHUNK_SIZE)),
+            'chunk_overlap': int(os.getenv('CHUNK_OVERLAP', cls.DEFAULT_CHUNK_OVERLAP)),
+            'encoding': os.getenv('ENCODING', cls.DEFAULT_ENCODING)
+        }
+
+    @classmethod
+    def get_embedding_config(cls) -> Dict[str, Any]:
+        return {
+            'model_name': os.getenv('EMBEDDING_MODEL', cls.DEFAULT_EMBEDDING_MODEL),
+        }
+
+    @classmethod
+    def get_vector_store_config(cls) -> Dict[str, Any]:
+        return {
+            'persist_directory': os.getenv('PERSIST_DIRECTORY', cls.DEFAULT_PERSIST_DIRECTORY),
+            'retrieval_k': int(os.getenv('RETRIEVAL_K', cls.DEFAULT_RETRIEVAL_K))
+        }
+
+    @classmethod
+    def get_llm_config(cls) -> Dict[str, Any]:
+        return {
+            'temperature': float(os.getenv('LLM_TEMPERATURE', cls.DEFAULT_TEMPERATURE)),
+            'chain_type': os.getenv('LLM_CHAIN_TYPE', cls.DEFAULT_CHAIN_TYPE),
+            'api_key': os.getenv('GOOGLE_API_KEY')
+        }
+
+    @classmethod
+    def get_file_settings(cls) -> Dict[str, Any]:
+        return {
+            'supported_types': cls.SUPPORTED_FILE_TYPES,
+            'max_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', cls.MAX_FILE_SIZE_MB))
+        }
+
+    @classmethod
+    def get_all_configs(cls) -> Dict[str, Any]:
+        return {
+            'document_processing': cls.get_doc_processing_config(),
+            'embedding': cls.get_embedding_config(),
+            'vector_store': cls.get_vector_store_config(),
+            'llm': cls.get_llm_config(),
+            'file_settings': cls.get_file_settings()
+        }
+
+    @classmethod
+    def validate_config(cls) -> bool:
+        llm_config = cls.get_llm_config()
+
+        if not llm_config['api_key']:
+            return False
+
+        return True
+
+    @classmethod
+    def get_environment_info(cls) -> Dict[str, Any]:
+        return {
+            'python_version': sys.version,
+            'environment_variables': {
+                'GOOGLE_API_KEY': 'SET' if os.getenv('GOOGLE_API_KEY') else 'NOT SET',
+                'CHUNK_SIZE': os.getenv('CHUNK_SIZE', 'DEFAULT'),
+                'EMBEDDING_MODEL': os.getenv('EMBEDDING_MODEL', 'DEFAULT'),
+                'PERSIST_DIRECTORY': os.getenv('PERSIST_DIRECTORY', 'DEFAULT'),
+            }
+        }
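Because every getter calls `os.getenv` at call time rather than at import time, environment overrides take effect without re-importing; a short sketch of what that implies (illustrative values):

```python
# Illustrative sketch: Config getters read the environment at call time.
import os
from src.config import Config

os.environ["CHUNK_SIZE"] = "500"                          # override before the getter runs
print(Config.get_doc_processing_config()["chunk_size"])   # 500, not the class default of 1000
print(Config.validate_config())                           # False unless GOOGLE_API_KEY is set
```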
src/document_processor.py
ADDED
@@ -0,0 +1,82 @@
+import logging
+from typing import List, Optional
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders.text import TextLoader
+from langchain_core.documents import Document
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+
+class DocumentProcessor:
+    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
+        config = Config.get_doc_processing_config()
+        self.chunk_size = chunk_size or config['chunk_size']
+        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+
+    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
+        try:
+            config = Config.get_doc_processing_config()
+            encoding = encoding or config['encoding']
+            logger.info(f"Loading document from {file_path}")
+            loader = TextLoader(file_path, encoding=encoding)
+            documents = loader.load()
+            logger.info(f"Successfully loaded {len(documents)} document(s)")
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error loading document from {file_path}: {e}")
+            raise e
+
+    def chunk_documents(self, documents: List[Document]) -> List[Document]:
+        try:
+            logger.info(f"Chunking {len(documents)} document(s)")
+            chunks = self.text_splitter.split_documents(documents)
+            logger.info(f"Successfully created {len(chunks)} chunk(s)")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error chunking documents: {e}")
+            raise e
+
+    def process_document(self, file_path: str) -> List[Document]:
+        try:
+            documents = self.load_document(file_path)
+            chunks = self.chunk_documents(documents)
+            logger.info(f"Document processing completed: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing document: {e}")
+            raise e
+
+    def get_document_stats(self, chunks: List[Document]) -> dict:
+        if not chunks:
+            return {
+                'total_chunks': 0,
+                'total_characters': 0,
+                'avg_chunk_size': 0,
+                'min_chunk_size': 0,
+                'max_chunk_size': 0
+            }
+
+        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
+        total_chars = sum(chunk_sizes)
+
+        return {
+            'total_chunks': len(chunks),
+            'total_characters': total_chars,
+            'avg_chunk_size': total_chars / len(chunks),
+            'min_chunk_size': min(chunk_sizes),
+            'max_chunk_size': max(chunk_sizes)
+        }
ADDED
@@ -0,0 +1,66 @@
|
+import logging
+from typing import List, Optional
+#from langchain_openai import OpenAIEmbeddings
+from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
+
+logger = logging.getLogger(__name__)
+
+class EmbeddingManager:
+    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):  # text-embedding-3-small
+        self.model_name = model_name
+        self.embeddings = None
+        self._initialize_embeddings()
+
+    def _initialize_embeddings(self):
+        try:
+            logger.info(f"Initializing embedding model: {self.model_name}")
+            # HuggingFaceEmbeddings expects model_name= (model= fails validation)
+            self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name, model_kwargs={'device': 'cpu'})
+            logger.info("Embedding model initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing embedding model: {e}")
+            raise e
+
+    def get_embeddings(self) -> HuggingFaceEmbeddings:
+        if self.embeddings is None:
+            self._initialize_embeddings()
+        return self.embeddings
+
+    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
+        try:
+            logger.info(f"Generating embeddings for {len(texts)} text(s)")
+            embeddings = self.embeddings.embed_documents(texts)
+            logger.info(f"Successfully generated {len(embeddings)} embeddings")
+            return embeddings
+
+        except Exception as e:
+            logger.error(f"Error generating embeddings: {e}")
+            raise e
+
+    def generate_single_embedding(self, text: str) -> List[float]:
+        try:
+            embedding = self.embeddings.embed_query(text)
+            return embedding
+
+        except Exception as e:
+            logger.error(f"Error generating single embedding: {e}")
+            raise e
+
+    def get_embedding_dimension(self) -> int:
+        try:
+            test_embedding = self.generate_single_embedding("test")
+            return len(test_embedding)
+
+        except Exception as e:
+            logger.error(f"Error getting embedding dimension: {e}")
+            raise e
+
+    def get_model_info(self) -> dict:
+        return {
+            'model_name': self.model_name,
+            'dimension': self.get_embedding_dimension(),
+            'is_initialized': self.embeddings is not None
+        }
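The default `all-MiniLM-L6-v2` model produces 384-dimensional vectors, which `get_embedding_dimension` confirms empirically by embedding a probe string; a quick sketch (the first call downloads the model):

```python
# Sketch: inspecting the default embedding model.
from src.embedding_manager import EmbeddingManager

em = EmbeddingManager()                            # downloads the model on first use
vec = em.generate_single_embedding("hello world")
print(len(vec))                                    # 384 for all-MiniLM-L6-v2
print(em.get_model_info())                         # name, dimension, init status
```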
src/rag_pipeline.py
ADDED
@@ -0,0 +1,161 @@
+import logging
+import os
+from typing import List, Optional, Tuple
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.chains import RetrievalQA
+from langchain_core.documents import Document
+import google.generativeai as genai
+
+from .document_processor import DocumentProcessor
+from .embedding_manager import EmbeddingManager
+from .vector_store import VectorStoreManager
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+# Load API key from .env file
+google_api_key = os.environ.get("GOOGLE_API_KEY")
+if not google_api_key:
+    raise ValueError("GOOGLE_API_KEY not found in .env file")
+
+class RAGPipeline:
+    def __init__(self, api_key: Optional[str] = None, chunk_size: int = 1000, chunk_overlap: int = 200, embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", persist_directory: str = "./chroma_db", temperature: float = 0.3):
+        self.api_key = api_key
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.embedding_model = embedding_model
+        self.persist_directory = persist_directory
+        self.temperature = temperature
+        self.document_processor = None
+        self.embedding_manager = None
+        self.vector_store_manager = None
+        self.llm = None
+        self.qa_chain = None
+
+        self._initialize_components()
+
+    def _initialize_components(self):
+        try:
+            logger.info("Initializing RAG Pipeline components")
+
+            genai.configure(api_key=google_api_key)
+
+            self.document_processor = DocumentProcessor(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+            self.embedding_manager = EmbeddingManager(model_name=self.embedding_model)
+            self.vector_store_manager = VectorStoreManager(persist_directory=self.persist_directory, embedding_function=self.embedding_manager.get_embeddings())
+            self.vector_store_manager.initialize_vector_store()
+            self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=self.temperature)
+
+            logger.info("RAG Pipeline components initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing RAG Pipeline components: {e}")
+            raise e
+
+    def process_document(self, file_path: str) -> bool:
+        try:
+            logger.info(f"Processing document: {file_path}")
+            # Chunk document
+            chunks = self.document_processor.process_document(file_path)
+            if not chunks:
+                logger.error("No chunks generated from document")
+                return False
+            # Add chunks to vector store
+            success = self.vector_store_manager.add_documents(chunks)
+            if not success:
+                logger.error("Failed to add chunks to vector store")
+                return False
+            # Initialize QA chain
+            retriever = self.vector_store_manager.get_retriever()
+            self.qa_chain = RetrievalQA.from_chain_type(
+                llm=self.llm,
+                chain_type="stuff",
+                retriever=retriever,
+                return_source_documents=True
+            )
+
+            logger.info("Document processed successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error processing document: {e}")
+            return False
+
+    def query(self, question: str) -> Tuple[str, List[Document]]:
+        try:
+            if not self.qa_chain:
+                return "Please process a document first before asking questions.", []
+            logger.info(f"Processing query: '{question}'")
+            # invoke() replaces the deprecated direct-call style, chain({"query": ...})
+            response = self.qa_chain.invoke({"query": question})
+            answer = response['result']
+            source_docs = response.get("source_documents", [])
+            logger.info(f"Query completed successfully. Answer length: {len(answer)}")
+            return answer, source_docs
+
+        except Exception as e:
+            logger.error(f"Error processing query: {e}")
+            return f"Error processing query: {str(e)}", []
+
+    def get_system_info(self) -> dict:
+        try:
+            info = {
+                'chunk_size': self.chunk_size,
+                'chunk_overlap': self.chunk_overlap,
+                'embedding_model': self.embedding_model,
+                'persist_directory': self.persist_directory,
+                'temperature': self.temperature,
+                'components_initialized': {
+                    'document_processor': self.document_processor is not None,
+                    'embedding_manager': self.embedding_manager is not None,
+                    'vector_store_manager': self.vector_store_manager is not None,
+                    'llm': self.llm is not None,
+                    'qa_chain': self.qa_chain is not None
+                }
+            }
+
+            # Add embedding model info
+            if self.embedding_manager:
+                info['embedding_info'] = self.embedding_manager.get_model_info()
+
+            # Add vector store stats
+            if self.vector_store_manager:
+                info['vector_store_stats'] = self.vector_store_manager.get_collection_stats()
+
+            return info
+
+        except Exception as e:
+            logger.error(f"Error getting system info: {e}")
+            return {}
+
+    def clear_knowledge_base(self) -> bool:
+        try:
+            logger.info("Clearing knowledge base")
+
+            # Clear vector store
+            if self.vector_store_manager:
+                self.vector_store_manager.clear_vector_store()
+
+            # Reset QA chain
+            self.qa_chain = None
+
+            logger.info("Knowledge base cleared successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error clearing knowledge base: {e}")
+            return False
+
+    def is_ready(self) -> bool:
+        return (
+            self.document_processor is not None and
+            self.embedding_manager is not None and
+            self.vector_store_manager is not None and
+            self.llm is not None and
+            self.qa_chain is not None
+        )
src/ui_components.py
ADDED
@@ -0,0 +1,89 @@
+import streamlit as st
+
+def setup_page_config():
+    st.set_page_config(
+        page_title="RAG Document System",
+        page_icon="📚",
+        layout="centered"
+    )
+
+def load_custom_css():
+    st.markdown("""
+    <style>
+    .info-box {
+        background-color: #f0f2f6;
+        color: #000;
+        padding: 0.5rem;
+        margin: 0.5rem;
+        border-radius: 0.5rem;
+        border-left: 4px solid #1f77b4;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+def render_header():
+    st.markdown('<h1 class="main-header">📚 RAG Document System</h1>', unsafe_allow_html=True)
+    st.markdown('<h2 class="sub-header">Upload and interact with your documents</h2>', unsafe_allow_html=True)
+
+def render_getting_started():
+    st.markdown("""
+    <div class="info-box">
+    <h4>Getting Started</h4>
+    <p>1. Upload a text document (.txt) using the file uploader above</p>
+    <p>2. Wait for the document to be processed</p>
+    <p>3. Start asking questions about your document!</p>
+    </div>
+    """, unsafe_allow_html=True)
+
+def render_system_info(system_info: dict):
+    """Render system information"""
+    with st.expander("🔧 System Information"):
+        if not system_info:
+            st.info("System information not available")
+            return
+
+        # Basic configuration
+        st.markdown("**Configuration:**")
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.write(f"• Chunk Size: {system_info.get('chunk_size', 'N/A')}")
+            st.write(f"• Chunk Overlap: {system_info.get('chunk_overlap', 'N/A')}")
+            st.write(f"• Temperature: {system_info.get('temperature', 'N/A')}")
+
+        with col2:
+            st.write(f"• Embedding Model: {system_info.get('embedding_model', 'N/A')}")
+            st.write(f"• Persist Directory: {system_info.get('persist_directory', 'N/A')}")
+
+        # Component status
+        st.markdown("**Component Status:**")
+        components = system_info.get('components_initialized', {})
+        for component, status in components.items():
+            status_icon = "✅" if status else "❌"
+            st.write(f"{status_icon} {component.replace('_', ' ').title()}")
+
+        # Embedding info
+        if 'embedding_info' in system_info:
+            st.markdown("**Embedding Model Info:**")
+            embedding_info = system_info['embedding_info']
+            st.write(f"• Model: {embedding_info.get('model_name', 'N/A')}")
+            st.write(f"• Device: {embedding_info.get('device', 'N/A')}")
+            st.write(f"• Dimensions: {embedding_info.get('dimension', 'N/A')}")
+
+        # Vector store stats
+        if 'vector_store_stats' in system_info:
+            st.markdown("**Vector Store Stats:**")
+            vector_stats = system_info['vector_store_stats']
+            st.write(f"• Total Documents: {vector_stats.get('total_documents', 0)}")
+            st.write(f"• Collection: {vector_stats.get('collection_name', 'N/A')}")
+
+def render_processing_spinner(message: str = "Processing..."):
+    return st.spinner(message)
src/vector_store.py
ADDED
@@ -0,0 +1,144 @@
+import logging
+import os
+from typing import List, Optional, Tuple
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+
+logger = logging.getLogger(__name__)
+
+class VectorStoreManager:
+    def __init__(self, persist_directory: str = "./chroma_db", embedding_function: Optional[Embeddings] = None):
+        self.persist_directory = persist_directory
+        self.embedding_function = embedding_function
+        self.vector_store = None
+        self._ensure_persist_directory()
+
+    def _ensure_persist_directory(self):
+        try:
+            os.makedirs(self.persist_directory, exist_ok=True)
+            logger.info(f"Persist directory ensured: {self.persist_directory}")
+        except Exception as e:
+            logger.error(f"Error creating persist directory: {e}")
+            raise e
+
+    def initialize_vector_store(self, embedding_function: Optional[Embeddings] = None):
+        if embedding_function:
+            self.embedding_function = embedding_function
+
+        if not self.embedding_function:
+            raise ValueError("Embedding function must be provided")
+
+        try:
+            logger.info("Initializing vector store")
+            self.vector_store = Chroma(
+                persist_directory=self.persist_directory,
+                embedding_function=self.embedding_function
+            )
+            logger.info("Vector store initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing vector store: {e}")
+            raise e
+
+    def add_documents(self, documents: List[Document]) -> bool:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Adding {len(documents)} document(s) to vector store")
+            self.vector_store.add_documents(documents)
+            logger.info("Documents added successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error adding documents to vector store: {e}")
+            return False
+
+    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Performing similarity search for query: '{query[:50]}...'")
+            results = self.vector_store.similarity_search(query, k=k)
+            logger.info(f"Found {len(results)} similar documents")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error performing similarity search: {e}")
+            return []
+
+    def similarity_search_with_score(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            logger.info(f"Performing similarity search with scores for query: '{query[:50]}...'")
+            results = self.vector_store.similarity_search_with_score(query, k=k)
+            logger.info(f"Found {len(results)} similar documents with scores")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error performing similarity search with scores: {e}")
+            return []
+
+    def get_retriever(self, search_kwargs: Optional[dict] = None):
+        try:
+            if not self.vector_store:
+                raise ValueError("Vector store not initialized")
+
+            default_kwargs = {"k": 5}
+            if search_kwargs:
+                default_kwargs.update(search_kwargs)
+
+            retriever = self.vector_store.as_retriever(search_kwargs=default_kwargs)
+            logger.info("Retriever created successfully")
+            return retriever
+
+        except Exception as e:
+            logger.error(f"Error creating retriever: {e}")
+            raise e
+
+    def get_collection_stats(self) -> dict:
+        try:
+            if not self.vector_store:
+                return {'total_documents': 0, 'collection_name': None}
+
+            collection = self.vector_store._collection
+            count = collection.count()
+
+            return {
+                'total_documents': count,
+                'collection_name': collection.name,
+                'persist_directory': self.persist_directory
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting collection stats: {e}")
+            return {'total_documents': 0, 'collection_name': None}
+
+    def clear_vector_store(self) -> bool:
+        try:
+            if not self.vector_store:
+                return True
+
+            logger.info("Clearing vector store")
+            # Chroma rejects an empty `where` filter, so delete by explicit ids
+            ids = self.vector_store._collection.get()['ids']
+            if ids:
+                self.vector_store._collection.delete(ids=ids)
+            logger.info("Vector store cleared successfully")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error clearing vector store: {e}")
+            return False
+
+    def is_initialized(self) -> bool:
+        return self.vector_store is not None
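For completeness, the manager can also be driven directly, outside `RAGPipeline`; a sketch reusing the `EmbeddingManager` from this commit (the query string is illustrative):

```python
# Sketch: using VectorStoreManager directly, outside RAGPipeline.
from src.embedding_manager import EmbeddingManager
from src.vector_store import VectorStoreManager

manager = VectorStoreManager(
    persist_directory="./chroma_db",
    embedding_function=EmbeddingManager().get_embeddings()
)
manager.initialize_vector_store()
print(manager.get_collection_stats())

# scores are Chroma distances: lower means more similar
for doc, score in manager.similarity_search_with_score("example query", k=3):
    print(round(score, 3), doc.page_content[:80])
```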