hugging2021 committed on
Commit 816825a · verified · 1 Parent(s): 876494d

Upload folder using huggingface_hub

.dockerignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+ .git/
+ .gitignore
+ README.md
+ .vscode/
+ *.egg-info/
+ dist/
+ build/
+ .pytest_cache/
+ .coverage
+ .tox/
+ .cache
+ chroma_db/
+ documents/
.env.example ADDED
@@ -0,0 +1,5 @@
+ GOOGLE_API_KEY=your_google_api_key_here
+ CHUNK_SIZE=1000
+ CHUNK_OVERLAP=200
+ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+ LLM_TEMPERATURE=0.3
.gitignore ADDED
@@ -0,0 +1,64 @@
+ # Environment variables (keep .env.example)
+ .env
+ !.env.example
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Testing
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Application specific
+ chroma_db/
+ documents/
+ *.log
Dockerfile ADDED
@@ -0,0 +1,40 @@
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     software-properties-common \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements file first (for better caching)
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy source code
+ COPY src/ ./src/
+ COPY app.py .
+
+ # Create directories for data persistence
+ RUN mkdir -p /app/chroma_db /app/documents
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+ USER appuser
+
+ # Expose Streamlit port
+ EXPOSE 8501
+
+ # Run the application
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 SneakyGraySnake
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,84 @@
- ---
- title: Rag Document System
- emoji: 🌍
- colorFrom: red
- colorTo: green
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RAG Document System
+
+ A simple document interaction system using Retrieval-Augmented Generation (RAG) with Streamlit and Google's Gemini AI.
+
+ ## Features
+
+ - Upload text documents (.txt files)
+ - Ask questions about your documents
+ - Get AI-powered answers with source citations
+ - Persistent vector database storage
+ - Clean web interface
+
+ ## Setup
+
+ ### With Docker (Recommended)
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/tusiim3/RAG-Document-System.git
+ cd RAG-Document-System
+ ```
+
+ 2. Copy `.env.example` to `.env` and add your Google API key:
+ ```bash
+ cp .env.example .env
+ ```
+
+ 3. Run with Docker Compose:
+ ```bash
+ docker-compose up --build
+ ```
+
+ 4. Open http://localhost:8501 in your browser
+
+ ### Without Docker
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/tusiim3/RAG-Document-System.git
+ cd RAG-Document-System
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Copy `.env.example` to `.env` and add your Google API key:
+ ```bash
+ cp .env.example .env
+ ```
+
+ 4. Run the application:
+ ```bash
+ streamlit run app.py
+ ```
+
+ 5. Open http://localhost:8501 in your browser
+
+ ## Environment Variables
+
+ Set in the `.env` file:
+ - `GOOGLE_API_KEY` - Your Google API key for Gemini (required)
+ - `CHUNK_SIZE` - Text chunk size (default: 1000)
+ - `CHUNK_OVERLAP` - Chunk overlap (default: 200)
+ - `EMBEDDING_MODEL` - Embedding model name (default: sentence-transformers/all-MiniLM-L6-v2)
+ - `LLM_TEMPERATURE` - AI response temperature (default: 0.3)
+
+ ## Usage
+
+ 1. Upload a text document using the file uploader
+ 2. Wait for document processing to complete
+ 3. Ask questions about the document in the chat interface
+ 4. View source documents for each answer
+
+ ## Technology Stack
+
+ - Streamlit for web interface
+ - LangChain for document processing
+ - ChromaDB for vector storage
+ - Google Gemini for AI responses
+ - Docker for containerization
app.py ADDED
@@ -0,0 +1,259 @@
+ import streamlit as st
+ import os
+ import tempfile
+ import logging
+ from dotenv import load_dotenv
+ import uuid
+
+ # UI components moved to src/ui_components.py for easier debugging and maintenance
+ from src.ui_components import (
+     setup_page_config, load_custom_css, render_header,
+     render_getting_started, render_system_info,
+     render_processing_spinner
+ )
+ from src.rag_pipeline import RAGPipeline
+
+ load_dotenv()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def initialize_session_state():
+     if 'session_id' not in st.session_state:
+         st.session_state.session_id = str(uuid.uuid4())
+
+     if 'rag_pipeline' not in st.session_state:
+         st.session_state.rag_pipeline = None
+
+     if 'messages' not in st.session_state:
+         st.session_state.messages = []
+
+     if 'rag_sources' not in st.session_state:
+         st.session_state.rag_sources = []
+
+     if 'document_loaded' not in st.session_state:
+         st.session_state.document_loaded = False
+
+     if 'document_stats' not in st.session_state:
+         st.session_state.document_stats = None
+
+ def process_uploaded_document(uploaded_file):
+     try:
+         st.info(f"Starting to process: {uploaded_file.name}")
+
+         # Save uploaded file temporarily
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.txt', mode='w', encoding='utf-8') as tmp_file:
+             content = uploaded_file.getvalue().decode('utf-8')
+             tmp_file.write(content)
+             tmp_file_path = tmp_file.name
+
+         st.info(f"File saved temporarily at: {tmp_file_path}")
+         st.info(f"File content length: {len(content)} characters")
+
+         # Initialize RAG pipeline if not already done
+         if st.session_state.rag_pipeline is None:
+             st.info("Initializing RAG pipeline...")
+             st.session_state.rag_pipeline = RAGPipeline()
+
+         # Process document
+         st.info("Processing document through RAG pipeline...")
+         success = st.session_state.rag_pipeline.process_document(tmp_file_path)
+
+         if success:
+             st.info("Document processed successfully, getting statistics...")
+             # Get document statistics
+             chunks = st.session_state.rag_pipeline.document_processor.process_document(tmp_file_path)
+             stats = st.session_state.rag_pipeline.document_processor.get_document_stats(chunks)
+
+             # Update session state
+             st.session_state.document_loaded = True
+             st.session_state.document_stats = stats
+
+             st.info(f"Document processed successfully: {stats['total_chunks']} chunks")
+         else:
+             st.error("Failed to process document")
+
+         # Clean up temporary file
+         os.unlink(tmp_file_path)
+
+         return success
+
+     except Exception as e:
+         st.error(f"Error processing uploaded document: {e}")
+         logger.error(f"Error processing uploaded document: {e}")
+         return False
+
+ def handle_user_query(user_question):
+     try:
+         if not st.session_state.rag_pipeline or not st.session_state.document_loaded:
+             return "Please upload a document first before asking questions.", []
+
+         # Note: main() has already appended the user question to
+         # st.session_state.messages; appending it again here would
+         # duplicate the entry in the chat history.
+
+         # Get response from RAG pipeline
+         with render_processing_spinner("Thinking..."):
+             answer, source_docs = st.session_state.rag_pipeline.query(user_question)
+
+         # Add assistant response to messages
+         st.session_state.messages.append({
+             "role": "assistant",
+             "content": answer,
+             "sources": source_docs
+         })
+
+         logger.info(f"Query processed: '{user_question[:50]}...'")
+         return answer, source_docs
+
+     except Exception as e:
+         logger.error(f"Error handling user query: {e}")
+         error_message = f"Error processing query: {str(e)}"
+         st.session_state.messages.append({"role": "assistant", "content": error_message, "sources": []})
+         return error_message, []
+
+ def clear_all_documents():
+     # Clear the vector store before dropping the pipeline reference;
+     # doing it afterwards would always see rag_pipeline as None
+     if st.session_state.rag_pipeline and st.session_state.rag_pipeline.vector_store_manager:
+         st.session_state.rag_pipeline.vector_store_manager.clear_vector_store()
+
+     st.session_state.rag_sources = []
+     st.session_state.document_loaded = False
+     st.session_state.document_stats = None
+     st.session_state.rag_pipeline = None
+     st.session_state.uploaded_files = []
+
+     # Increment uploader key to reset file uploader
+     if 'uploader_key' not in st.session_state:
+         st.session_state.uploader_key = 0
+     st.session_state.uploader_key += 1
+     st.rerun()
+
+ def process_uploaded_files():
+     if 'uploaded_files' in st.session_state and st.session_state.uploaded_files:
+         for uploaded_file in st.session_state.uploaded_files:
+             if uploaded_file.name not in st.session_state.rag_sources:
+                 # Simple test - just read the file content first
+                 try:
+                     content = uploaded_file.getvalue().decode('utf-8')
+                     st.success(f"✅ {uploaded_file.name} uploaded successfully! Content length: {len(content)} characters")
+                     st.session_state.rag_sources.append(uploaded_file.name)
+
+                     # Set document_loaded to True when we have files
+                     st.session_state.document_loaded = True
+
+                     # Now try to process with RAG pipeline
+                     with st.spinner(f"Processing {uploaded_file.name} with RAG..."):
+                         success = process_uploaded_document(uploaded_file)
+                         if success:
+                             st.success(f"✅ {uploaded_file.name} RAG processing completed!")
+                         else:
+                             st.error(f"❌ RAG processing failed for {uploaded_file.name}")
+
+                 except Exception as e:
+                     st.error(f"❌ Error reading {uploaded_file.name}: {e}")
+
+         # Clear the uploaded files from session state to prevent reprocessing
+         st.session_state.uploaded_files = []
+
+ def main():
+     # Setup page configuration and styling
+     setup_page_config()
+     load_custom_css()
+
+     # Initialize session state
+     initialize_session_state()
+
+     # Render main header
+     render_header()
+
+     # Add getting started section
+     if not st.session_state.document_loaded:
+         render_getting_started()
+
+     # Clear buttons
+     col1, col2 = st.columns(2)
+     with col1:
+         if st.button("Clear Chat", type="primary"):
+             st.session_state.messages.clear()
+             st.rerun()
+     with col2:
+         if st.button("Clear All Documents", type="secondary"):
+             clear_all_documents()
+
+     # Initialize uploader key
+     if 'uploader_key' not in st.session_state:
+         st.session_state.uploader_key = 0
+
+     # File upload input
+     uploaded_files = st.file_uploader(
+         "📄 Upload a text document (.txt only, max 200MB)",
+         type=["txt"],
+         accept_multiple_files=True,
+         key=f"rag_docs_{st.session_state.uploader_key}"
+     )
+
+     # Store uploaded files in session state and process them
+     if uploaded_files:
+         st.session_state.uploaded_files = uploaded_files
+         st.info(f"Files uploaded: {[f.name for f in uploaded_files]}")
+         process_uploaded_files()
+
+     # Show documents in DB with individual remove buttons
+     with st.expander(f"📚 Documents in DB ({len(st.session_state.rag_sources)})"):
+         if st.session_state.rag_sources:
+             for i, doc in enumerate(st.session_state.rag_sources):
+                 col1, col2 = st.columns([3, 1])
+                 with col1:
+                     st.write(f"• {doc}")
+                 with col2:
+                     if st.button("🗑️", key=f"remove_doc_{i}_{doc}"):
+                         # Remove the document
+                         st.session_state.rag_sources.pop(i)
+                         # Reset document_loaded if no documents left
+                         if len(st.session_state.rag_sources) == 0:
+                             st.session_state.document_loaded = False
+                             st.session_state.document_stats = None
+                             st.session_state.rag_pipeline = None
+                         st.rerun()
+         else:
+             st.write("No documents in database")
+
+     # Display chat history
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # Chat input
+     if prompt := st.chat_input("Your message"):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         with st.chat_message("assistant"):
+             # RAG response
+             answer, source_docs = handle_user_query(prompt)
+             st.write(answer)
+
+             # Show source documents if available
+             if source_docs and isinstance(source_docs, list) and len(source_docs) > 0:
+                 with st.expander("📄 View Source Documents"):
+                     for i, doc in enumerate(source_docs[:3]):  # Show top 3 sources
+                         st.markdown(f"**Source {i+1}:**")
+                         st.markdown(f'{doc.page_content[:300]}{"..." if len(doc.page_content) > 300 else ""}')
+                         st.divider()
+
+     # System information
+     if st.session_state.rag_pipeline:
+         system_info = st.session_state.rag_pipeline.get_system_info()
+         render_system_info(system_info)
+
+
+ if __name__ == "__main__":
+     main()
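The chat flow above hinges on the shape of `st.session_state.messages`. A minimal stand-in with Streamlit stripped away; the example strings are invented for illustration:

```python
# Sketch of the chat-history shape app.py keeps in st.session_state.messages.
messages = []

# What main() appends when the user submits a prompt:
messages.append({"role": "user", "content": "What is this document about?"})

# What handle_user_query() appends once the pipeline answers
# ("sources" holds the retrieved Document objects; empty here):
messages.append({"role": "assistant", "content": "It describes ...", "sources": []})

# The history loop then renders each entry under its role:
for message in messages:
    print(f"[{message['role']}] {message['content']}")
```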
docker-compose.yaml ADDED
@@ -0,0 +1,26 @@
+ services:
+   rag-app:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     container_name: rag-document-system
+     ports:
+       - "8501:8501"
+     environment:
+       - GOOGLE_API_KEY=${GOOGLE_API_KEY}
+       - CHUNK_SIZE=${CHUNK_SIZE:-1000}
+       - CHUNK_OVERLAP=${CHUNK_OVERLAP:-200}
+       - EMBEDDING_MODEL=${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
+       - PERSIST_DIRECTORY=/app/chroma_db
+       - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.3}
+     volumes:
+       - ./chroma_db:/app/chroma_db
+       - ./documents:/app/documents
+     env_file:
+       - .env
+     restart: unless-stopped
+     healthcheck:
+       test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
+       interval: 30s
+       timeout: 10s
+       retries: 3
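The healthcheck polls Streamlit's built-in `/_stcore/health` endpoint. A rough host-side equivalent in Python, assuming the container is mapped to localhost:8501 as in the YAML above:

```python
# Rough Python equivalent of the compose healthcheck, run from the host.
import urllib.request

try:
    with urllib.request.urlopen("http://localhost:8501/_stcore/health", timeout=10) as resp:
        print("healthy" if resp.status == 200 else f"unexpected status {resp.status}")
except OSError as exc:  # URLError/HTTPError both subclass OSError
    print(f"unhealthy: {exc}")
```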
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit>=1.28.0
+ langchain>=0.1.0
+ langchain-huggingface>=0.0.10
+ langchain-community>=0.0.10
+ langchain-core>=0.1.0
+ langchain-chroma>=0.0.10
+ langchain-google-genai>=0.0.6
+ google-generativeai>=0.3.0
+ chromadb>=0.4.0
+ python-dotenv>=1.0.0
+ typing-extensions>=4.5.0
src/config.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ from typing import Dict, Any
+
+ class Config:
+
+     # Document Processing
+     DEFAULT_CHUNK_SIZE = 1000
+     DEFAULT_CHUNK_OVERLAP = 200
+     DEFAULT_ENCODING = 'utf-8'
+
+     # Embedding Model
+     DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+
+     # Vector Store
+     DEFAULT_PERSIST_DIRECTORY = "./chroma_db"
+     DEFAULT_RETRIEVAL_K = 5
+
+     # LLM Settings
+     DEFAULT_TEMPERATURE = 0.3
+     DEFAULT_CHAIN_TYPE = "stuff"
+
+     # File Settings
+     SUPPORTED_FILE_TYPES = ["txt"]
+     MAX_FILE_SIZE_MB = 100
+
+     @classmethod
+     def get_doc_processing_config(cls) -> Dict[str, Any]:
+         return {
+             'chunk_size': int(os.getenv('CHUNK_SIZE', cls.DEFAULT_CHUNK_SIZE)),
+             'chunk_overlap': int(os.getenv('CHUNK_OVERLAP', cls.DEFAULT_CHUNK_OVERLAP)),
+             'encoding': os.getenv('ENCODING', cls.DEFAULT_ENCODING)
+         }
+
+     @classmethod
+     def get_embedding_config(cls) -> Dict[str, Any]:
+         return {
+             'model_name': os.getenv('EMBEDDING_MODEL', cls.DEFAULT_EMBEDDING_MODEL),
+         }
+
+     @classmethod
+     def get_vector_store_config(cls) -> Dict[str, Any]:
+         return {
+             'persist_directory': os.getenv('PERSIST_DIRECTORY', cls.DEFAULT_PERSIST_DIRECTORY),
+             'retrieval_k': int(os.getenv('RETRIEVAL_K', cls.DEFAULT_RETRIEVAL_K))
+         }
+
+     @classmethod
+     def get_llm_config(cls) -> Dict[str, Any]:
+         return {
+             'temperature': float(os.getenv('LLM_TEMPERATURE', cls.DEFAULT_TEMPERATURE)),
+             'chain_type': os.getenv('LLM_CHAIN_TYPE', cls.DEFAULT_CHAIN_TYPE),
+             'api_key': os.getenv('GOOGLE_API_KEY')
+         }
+
+     @classmethod
+     def get_file_settings(cls) -> Dict[str, Any]:
+         return {
+             'supported_types': cls.SUPPORTED_FILE_TYPES,
+             'max_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', cls.MAX_FILE_SIZE_MB))
+         }
+
+     @classmethod
+     def get_all_configs(cls) -> Dict[str, Any]:
+         return {
+             'document_processing': cls.get_doc_processing_config(),
+             'embedding': cls.get_embedding_config(),
+             'vector_store': cls.get_vector_store_config(),
+             'llm': cls.get_llm_config(),
+             'file_settings': cls.get_file_settings()
+         }
+
+     @classmethod
+     def validate_config(cls) -> bool:
+         llm_config = cls.get_llm_config()
+
+         if not llm_config['api_key']:
+             return False
+
+         return True
+
+     @classmethod
+     def get_environment_info(cls) -> Dict[str, Any]:
+         return {
+             'python_version': sys.version,
+             'environment_variables': {
+                 'GOOGLE_API_KEY': 'SET' if os.getenv('GOOGLE_API_KEY') else 'NOT SET',
+                 'CHUNK_SIZE': os.getenv('CHUNK_SIZE', 'DEFAULT'),
+                 'EMBEDDING_MODEL': os.getenv('EMBEDDING_MODEL', 'DEFAULT'),
+                 'PERSIST_DIRECTORY': os.getenv('PERSIST_DIRECTORY', 'DEFAULT'),
+             }
+         }
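A quick way to exercise these helpers from a REPL; the environment values below are examples only, and the dummy key exists just to satisfy `validate_config`:

```python
# Hypothetical quick check of the Config helpers.
import os
os.environ.setdefault("GOOGLE_API_KEY", "dummy-key-for-local-testing")  # placeholder
os.environ["CHUNK_SIZE"] = "500"

from src.config import Config

assert Config.validate_config()             # True once GOOGLE_API_KEY is set
print(Config.get_doc_processing_config())   # {'chunk_size': 500, 'chunk_overlap': 200, ...}
print(Config.get_environment_info()["environment_variables"])
```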
src/document_processor.py ADDED
@@ -0,0 +1,82 @@
+ import logging
+ from typing import List, Optional
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders.text import TextLoader
+ from langchain_core.documents import Document
+
+ from .config import Config
+
+ logger = logging.getLogger(__name__)
+
+ class DocumentProcessor:
+     def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
+         config = Config.get_doc_processing_config()
+         self.chunk_size = chunk_size or config['chunk_size']
+         self.chunk_overlap = chunk_overlap or config['chunk_overlap']
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.chunk_size,
+             chunk_overlap=self.chunk_overlap,
+             length_function=len,
+             separators=["\n\n", "\n", " ", ""]
+         )
+
+     def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
+         try:
+             config = Config.get_doc_processing_config()
+             encoding = encoding or config['encoding']
+             logger.info(f"Loading document from {file_path}")
+             loader = TextLoader(file_path, encoding=encoding)
+             documents = loader.load()
+             logger.info(f"Successfully loaded {len(documents)} document(s)")
+             return documents
+
+         except Exception as e:
+             logger.error(f"Error loading document from {file_path}: {e}")
+             raise e
+
+     def chunk_documents(self, documents: List[Document]) -> List[Document]:
+         try:
+             logger.info(f"Chunking {len(documents)} document(s)")
+             chunks = self.text_splitter.split_documents(documents)
+             logger.info(f"Successfully created {len(chunks)} chunk(s)")
+             return chunks
+
+         except Exception as e:
+             logger.error(f"Error chunking documents: {e}")
+             raise e
+
+     def process_document(self, file_path: str) -> List[Document]:
+         try:
+             documents = self.load_document(file_path)
+             chunks = self.chunk_documents(documents)
+             logger.info(f"Document processing completed: {len(chunks)} chunks created")
+             return chunks
+
+         except Exception as e:
+             logger.error(f"Error processing document: {e}")
+             raise e
+
+     def get_document_stats(self, chunks: List[Document]) -> dict:
+         if not chunks:
+             return {
+                 'total_chunks': 0,
+                 'total_characters': 0,
+                 'avg_chunk_size': 0,
+                 'min_chunk_size': 0,
+                 'max_chunk_size': 0
+             }
+
+         chunk_sizes = [len(chunk.page_content) for chunk in chunks]
+         total_chars = sum(chunk_sizes)
+
+         return {
+             'total_chunks': len(chunks),
+             'total_characters': total_chars,
+             'avg_chunk_size': total_chars / len(chunks),
+             'min_chunk_size': min(chunk_sizes),
+             'max_chunk_size': max(chunk_sizes)
+         }
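A sketch of the processor in isolation; `sample.txt` and the chunk sizes are invented for the demo:

```python
# Illustrative run of DocumentProcessor on a throwaway file.
from src.document_processor import DocumentProcessor

with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("First paragraph.\n\nSecond paragraph. " * 50)

processor = DocumentProcessor(chunk_size=200, chunk_overlap=40)
chunks = processor.process_document("sample.txt")
stats = processor.get_document_stats(chunks)
print(f"{stats['total_chunks']} chunks, avg {stats['avg_chunk_size']:.0f} chars")
```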
src/embedding_manager.py ADDED
@@ -0,0 +1,66 @@
+ import logging
+ from typing import List
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ logger = logging.getLogger(__name__)
+
+ class EmbeddingManager:
+     def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+         self.model_name = model_name
+         self.embeddings = None
+         self._initialize_embeddings()
+
+     def _initialize_embeddings(self):
+         try:
+             logger.info(f"Initializing embedding model: {self.model_name}")
+             # HuggingFaceEmbeddings takes the model under `model_name`, not `model`
+             self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name, model_kwargs={'device': 'cpu'})
+             logger.info("Embedding model initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing embedding model: {e}")
+             raise e
+
+     def get_embeddings(self) -> HuggingFaceEmbeddings:
+         if self.embeddings is None:
+             self._initialize_embeddings()
+         return self.embeddings
+
+     def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
+         try:
+             logger.info(f"Generating embeddings for {len(texts)} text(s)")
+             embeddings = self.embeddings.embed_documents(texts)
+             logger.info(f"Successfully generated {len(embeddings)} embeddings")
+             return embeddings
+
+         except Exception as e:
+             logger.error(f"Error generating embeddings: {e}")
+             raise e
+
+     def generate_single_embedding(self, text: str) -> List[float]:
+         try:
+             embedding = self.embeddings.embed_query(text)
+             return embedding
+
+         except Exception as e:
+             logger.error(f"Error generating single embedding: {e}")
+             raise e
+
+     def get_embedding_dimension(self) -> int:
+         try:
+             test_embedding = self.generate_single_embedding("test")
+             return len(test_embedding)
+
+         except Exception as e:
+             logger.error(f"Error getting embedding dimension: {e}")
+             raise e
+
+     def get_model_info(self) -> dict:
+         return {
+             'model_name': self.model_name,
+             'device': 'cpu',  # matches model_kwargs in _initialize_embeddings
+             'dimension': self.get_embedding_dimension(),
+             'is_initialized': self.embeddings is not None
+         }
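A sketch of direct use; it downloads the MiniLM model on first run, and the query strings are arbitrary:

```python
# Direct embedding use, outside the pipeline.
from src.embedding_manager import EmbeddingManager

manager = EmbeddingManager()
vec_a = manager.generate_single_embedding("vector databases")
vec_b = manager.generate_single_embedding("similarity search")

# Cosine similarity by hand, no numpy needed for a quick check
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = (sum(a * a for a in vec_a) ** 0.5) * (sum(b * b for b in vec_b) ** 0.5)
print(f"dimension={len(vec_a)}, cosine={dot / norm:.3f}")
```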
src/rag_pipeline.py ADDED
@@ -0,0 +1,161 @@
+ import logging
+ import os
+ from typing import List, Optional, Tuple
+ from dotenv import load_dotenv
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain.chains import RetrievalQA
+ from langchain_core.documents import Document
+ import google.generativeai as genai
+
+ from .document_processor import DocumentProcessor
+ from .embedding_manager import EmbeddingManager
+ from .vector_store import VectorStoreManager
+
+ load_dotenv()
+
+ logger = logging.getLogger(__name__)
+
+ # Load API key from .env file
+ google_api_key = os.environ.get("GOOGLE_API_KEY")
+ if not google_api_key:
+     raise ValueError("GOOGLE_API_KEY not found in .env file")
+
+ class RAGPipeline:
+     def __init__(self, api_key: Optional[str] = None, chunk_size: int = 1000, chunk_overlap: int = 200,
+                  embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
+                  persist_directory: str = "./chroma_db", temperature: float = 0.3):
+         self.api_key = api_key
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.embedding_model = embedding_model
+         self.persist_directory = persist_directory
+         self.temperature = temperature
+         self.document_processor = None
+         self.embedding_manager = None
+         self.vector_store_manager = None
+         self.llm = None
+         self.qa_chain = None
+
+         self._initialize_components()
+
+     def _initialize_components(self):
+         try:
+             logger.info("Initializing RAG Pipeline components")
+
+             # Prefer an explicitly passed key, fall back to the .env value
+             genai.configure(api_key=self.api_key or google_api_key)
+
+             self.document_processor = DocumentProcessor(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+             self.embedding_manager = EmbeddingManager(model_name=self.embedding_model)
+             self.vector_store_manager = VectorStoreManager(persist_directory=self.persist_directory, embedding_function=self.embedding_manager.get_embeddings())
+             self.vector_store_manager.initialize_vector_store()
+             self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=self.temperature)
+
+             logger.info("RAG Pipeline components initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing RAG Pipeline components: {e}")
+             raise e
+
+     def process_document(self, file_path: str) -> bool:
+         try:
+             logger.info(f"Processing document: {file_path}")
+             # Chunk document
+             chunks = self.document_processor.process_document(file_path)
+             if not chunks:
+                 logger.error("No chunks generated from document")
+                 return False
+             # Add chunks to vector store
+             success = self.vector_store_manager.add_documents(chunks)
+             if not success:
+                 logger.error("Failed to add chunks to vector store")
+                 return False
+             # Initialize QA chain
+             retriever = self.vector_store_manager.get_retriever()
+             self.qa_chain = RetrievalQA.from_chain_type(
+                 llm=self.llm,
+                 chain_type="stuff",
+                 retriever=retriever,
+                 return_source_documents=True
+             )
+
+             logger.info("Document processed successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error processing document: {e}")
+             return False
+
+     def query(self, question: str) -> Tuple[str, List[Document]]:
+         try:
+             if not self.qa_chain:
+                 return "Please process a document first before asking questions.", []
+             logger.info(f"Processing query: '{question}'")
+             # invoke() is the non-deprecated way to call the chain
+             response = self.qa_chain.invoke({"query": question})
+             answer = response['result']
+             source_docs = response.get("source_documents", [])
+             logger.info(f"Query completed successfully. Answer length: {len(answer)}")
+             return answer, source_docs
+
+         except Exception as e:
+             logger.error(f"Error processing query: {e}")
+             return f"Error processing query: {str(e)}", []
+
+     def get_system_info(self) -> dict:
+         try:
+             info = {
+                 'chunk_size': self.chunk_size,
+                 'chunk_overlap': self.chunk_overlap,
+                 'embedding_model': self.embedding_model,
+                 'persist_directory': self.persist_directory,
+                 'temperature': self.temperature,
+                 'components_initialized': {
+                     'document_processor': self.document_processor is not None,
+                     'embedding_manager': self.embedding_manager is not None,
+                     'vector_store_manager': self.vector_store_manager is not None,
+                     'llm': self.llm is not None,
+                     'qa_chain': self.qa_chain is not None
+                 }
+             }
+
+             # Add embedding model info
+             if self.embedding_manager:
+                 info['embedding_info'] = self.embedding_manager.get_model_info()
+
+             # Add vector store stats
+             if self.vector_store_manager:
+                 info['vector_store_stats'] = self.vector_store_manager.get_collection_stats()
+
+             return info
+
+         except Exception as e:
+             logger.error(f"Error getting system info: {e}")
+             return {}
+
+     def clear_knowledge_base(self) -> bool:
+         try:
+             logger.info("Clearing knowledge base")
+
+             # Clear vector store
+             if self.vector_store_manager:
+                 self.vector_store_manager.clear_vector_store()
+
+             # Reset QA chain
+             self.qa_chain = None
+
+             logger.info("Knowledge base cleared successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error clearing knowledge base: {e}")
+             return False
+
+     def is_ready(self) -> bool:
+         return (
+             self.document_processor is not None and
+             self.embedding_manager is not None and
+             self.vector_store_manager is not None and
+             self.llm is not None and
+             self.qa_chain is not None
+         )
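Since the pipeline has no Streamlit dependency of its own, it can be driven headlessly. A sketch, assuming `GOOGLE_API_KEY` is set and a local `notes.txt` exists (both illustrative):

```python
# Hypothetical headless use of the pipeline, bypassing the web UI.
from src.rag_pipeline import RAGPipeline  # raises at import if the key is missing

pipeline = RAGPipeline(chunk_size=500, chunk_overlap=100)

if pipeline.process_document("notes.txt"):
    answer, sources = pipeline.query("Summarize the main points.")
    print(answer)
    for doc in sources[:3]:
        print("---", doc.page_content[:120])
```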
src/ui_components.py ADDED
@@ -0,0 +1,89 @@
+ import streamlit as st
+
+ def setup_page_config():
+     st.set_page_config(
+         page_title="RAG Document System",
+         page_icon="📚",
+         layout="centered"
+     )
+
+ def load_custom_css():
+     st.markdown("""
+     <style>
+     /* Header classes referenced by render_header below */
+     .main-header {
+         text-align: center;
+     }
+     .sub-header {
+         text-align: center;
+         color: #555;
+     }
+     .info-box {
+         background-color: #f0f2f6;
+         color: #000;
+         padding: 0.5rem;
+         margin: 0.5rem;
+         border-radius: 0.5rem;
+         border-left: 4px solid #1f77b4;
+     }
+     </style>
+     """, unsafe_allow_html=True)
+
+ def render_header():
+     st.markdown('<h1 class="main-header">📚 RAG Document System</h1>', unsafe_allow_html=True)
+     st.markdown('<h2 class="sub-header">Upload and interact with your documents</h2>', unsafe_allow_html=True)
+
+ def render_getting_started():
+     st.markdown("""
+     <div class="info-box">
+     <h4>Getting Started</h4>
+     <p>1. Upload a text document (.txt) using the file uploader above</p>
+     <p>2. Wait for the document to be processed</p>
+     <p>3. Start asking questions about your document!</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+ def render_system_info(system_info: dict):
+     """Render system information"""
+     with st.expander("🔧 System Information"):
+         if not system_info:
+             st.info("System information not available")
+             return
+
+         # Basic configuration
+         st.markdown("**Configuration:**")
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.write(f"• Chunk Size: {system_info.get('chunk_size', 'N/A')}")
+             st.write(f"• Chunk Overlap: {system_info.get('chunk_overlap', 'N/A')}")
+             st.write(f"• Temperature: {system_info.get('temperature', 'N/A')}")
+
+         with col2:
+             st.write(f"• Embedding Model: {system_info.get('embedding_model', 'N/A')}")
+             st.write(f"• Persist Directory: {system_info.get('persist_directory', 'N/A')}")
+
+         # Component status
+         st.markdown("**Component Status:**")
+         components = system_info.get('components_initialized', {})
+         for component, status in components.items():
+             status_icon = "✅" if status else "❌"
+             st.write(f"{status_icon} {component.replace('_', ' ').title()}")
+
+         # Embedding info
+         if 'embedding_info' in system_info:
+             st.markdown("**Embedding Model Info:**")
+             embedding_info = system_info['embedding_info']
+             st.write(f"• Model: {embedding_info.get('model_name', 'N/A')}")
+             st.write(f"• Device: {embedding_info.get('device', 'N/A')}")
+             st.write(f"• Dimensions: {embedding_info.get('dimension', 'N/A')}")
+
+         # Vector store stats
+         if 'vector_store_stats' in system_info:
+             st.markdown("**Vector Store Stats:**")
+             vector_stats = system_info['vector_store_stats']
+             st.write(f"• Total Documents: {vector_stats.get('total_documents', 0)}")
+             st.write(f"• Collection: {vector_stats.get('collection_name', 'N/A')}")
+
+ def render_processing_spinner(message: str = "Processing..."):
+     return st.spinner(message)
src/vector_store.py ADDED
@@ -0,0 +1,144 @@
+ import logging
+ import os
+ from typing import List, Optional, Tuple
+ from langchain_chroma import Chroma
+ from langchain_core.documents import Document
+ from langchain_core.embeddings import Embeddings
+
+ logger = logging.getLogger(__name__)
+
+ class VectorStoreManager:
+     def __init__(self, persist_directory: str = "./chroma_db", embedding_function: Optional[Embeddings] = None):
+         self.persist_directory = persist_directory
+         self.embedding_function = embedding_function
+         self.vector_store = None
+         self._ensure_persist_directory()
+
+     def _ensure_persist_directory(self):
+         try:
+             os.makedirs(self.persist_directory, exist_ok=True)
+             logger.info(f"Persist directory ensured: {self.persist_directory}")
+         except Exception as e:
+             logger.error(f"Error creating persist directory: {e}")
+             raise e
+
+     def initialize_vector_store(self, embedding_function: Optional[Embeddings] = None):
+         if embedding_function:
+             self.embedding_function = embedding_function
+
+         if not self.embedding_function:
+             raise ValueError("Embedding function must be provided")
+
+         try:
+             logger.info("Initializing vector store")
+             self.vector_store = Chroma(
+                 persist_directory=self.persist_directory,
+                 embedding_function=self.embedding_function
+             )
+             logger.info("Vector store initialized successfully")
+
+         except Exception as e:
+             logger.error(f"Error initializing vector store: {e}")
+             raise e
+
+     def add_documents(self, documents: List[Document]) -> bool:
+         try:
+             if not self.vector_store:
+                 raise ValueError("Vector store not initialized")
+
+             logger.info(f"Adding {len(documents)} document(s) to vector store")
+             self.vector_store.add_documents(documents)
+             logger.info("Documents added successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error adding documents to vector store: {e}")
+             return False
+
+     def similarity_search(self, query: str, k: int = 5) -> List[Document]:
+         try:
+             if not self.vector_store:
+                 raise ValueError("Vector store not initialized")
+
+             logger.info(f"Performing similarity search for query: '{query[:50]}...'")
+             results = self.vector_store.similarity_search(query, k=k)
+             logger.info(f"Found {len(results)} similar documents")
+             return results
+
+         except Exception as e:
+             logger.error(f"Error performing similarity search: {e}")
+             return []
+
+     def similarity_search_with_score(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
+         try:
+             if not self.vector_store:
+                 raise ValueError("Vector store not initialized")
+
+             logger.info(f"Performing similarity search with scores for query: '{query[:50]}...'")
+             results = self.vector_store.similarity_search_with_score(query, k=k)
+             logger.info(f"Found {len(results)} similar documents with scores")
+             return results
+
+         except Exception as e:
+             logger.error(f"Error performing similarity search with scores: {e}")
+             return []
+
+     def get_retriever(self, search_kwargs: Optional[dict] = None):
+         try:
+             if not self.vector_store:
+                 raise ValueError("Vector store not initialized")
+
+             default_kwargs = {"k": 5}
+             if search_kwargs:
+                 default_kwargs.update(search_kwargs)
+
+             retriever = self.vector_store.as_retriever(search_kwargs=default_kwargs)
+             logger.info("Retriever created successfully")
+             return retriever
+
+         except Exception as e:
+             logger.error(f"Error creating retriever: {e}")
+             raise e
+
+     def get_collection_stats(self) -> dict:
+         try:
+             if not self.vector_store:
+                 return {'total_documents': 0, 'collection_name': None}
+
+             collection = self.vector_store._collection
+             count = collection.count()
+
+             return {
+                 'total_documents': count,
+                 'collection_name': collection.name,
+                 'persist_directory': self.persist_directory
+             }
+
+         except Exception as e:
+             logger.error(f"Error getting collection stats: {e}")
+             return {'total_documents': 0, 'collection_name': None}
+
+     def clear_vector_store(self) -> bool:
+         try:
+             if not self.vector_store:
+                 return True
+
+             logger.info("Clearing vector store")
+             # Chroma rejects an empty `where` filter, so delete by explicit ids
+             collection = self.vector_store._collection
+             ids = collection.get()['ids']
+             if ids:
+                 collection.delete(ids=ids)
+             logger.info("Vector store cleared successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error clearing vector store: {e}")
+             return False
+
+     def is_initialized(self) -> bool:
+         return self.vector_store is not None
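A sketch of using the store manager directly, paired with the embedding wrapper from `src/embedding_manager.py`; the sample document and query are invented:

```python
# Direct use of VectorStoreManager, without the full RAG pipeline.
from langchain_core.documents import Document
from src.embedding_manager import EmbeddingManager
from src.vector_store import VectorStoreManager

store = VectorStoreManager(persist_directory="./chroma_db",
                           embedding_function=EmbeddingManager().get_embeddings())
store.initialize_vector_store()
store.add_documents([Document(page_content="Chroma persists vectors on disk.")])

# Lower distance scores mean closer matches
for doc, score in store.similarity_search_with_score("where are vectors stored?", k=1):
    print(f"{score:.3f}: {doc.page_content}")
```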