37-AN
committed on
Commit
·
28ff371
1
Parent(s):
f8ed285
Fix AI responses and file uploading functionality
Browse files
- Improved AI responses with better prompt formatting and instructions
- Enhanced file upload handling with better error recovery
- Added support for more file types (docx, html, md, etc.)
- Improved UI with progress tracking and better error messages
- Fixed edge cases with empty files and error handling
- app/core/ingestion.py +107 -24
- app/core/memory.py +46 -7
- app/ui/streamlit_app.py +84 -10
app/core/ingestion.py
CHANGED
@@ -3,11 +3,14 @@ import sys
|
|
3 |
import logging
|
4 |
import time
|
5 |
import random
|
|
|
6 |
from typing import List, Dict, Any
|
7 |
from langchain.document_loaders import (
|
8 |
PyPDFLoader,
|
9 |
TextLoader,
|
10 |
-
CSVLoader
|
|
|
|
|
11 |
)
|
12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
|
@@ -42,36 +45,86 @@ class DocumentProcessor:
|
|
42 |
|
43 |
logger.info(f"Processing file: {file_path} with extension {extension}")
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
# Load the file using the appropriate loader
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
def _retry_operation(self, operation, max_retries=3):
|
63 |
"""Retry an operation with exponential backoff."""
|
|
|
64 |
for attempt in range(max_retries):
|
65 |
try:
|
66 |
return operation()
|
67 |
except Exception as e:
|
|
|
68 |
if "already accessed by another instance" in str(e) and attempt < max_retries - 1:
|
69 |
wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
|
70 |
logger.warning(f"Vector store access conflict, retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
|
71 |
time.sleep(wait_time)
|
|
|
|
|
|
|
|
|
|
|
72 |
else:
|
73 |
# Different error or last attempt, re-raise
|
74 |
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
def ingest_file(self, file_path: str, metadata: Dict[str, Any] = None) -> List[str]:
|
77 |
"""Ingest a file into the vector database."""
|
@@ -86,7 +139,8 @@ class DocumentProcessor:
|
|
86 |
# Add file path to metadata
|
87 |
base_metadata = {
|
88 |
"source": file_path,
|
89 |
-
"file_name": os.path.basename(file_path)
|
|
|
90 |
}
|
91 |
base_metadata.update(metadata)
|
92 |
|
@@ -99,26 +153,43 @@ class DocumentProcessor:
|
|
99 |
if hasattr(chunk, 'metadata'):
|
100 |
chunk_metadata.update(chunk.metadata)
|
101 |
chunk_metadata["chunk_id"] = i
|
|
|
102 |
metadatas.append(chunk_metadata)
|
103 |
|
104 |
# Store in vector database with retry mechanism
|
105 |
logger.info(f"Adding {len(texts)} chunks to vector database")
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def add_to_vectordb():
|
108 |
return self.memory_manager.add_texts(texts, metadatas)
|
109 |
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
return ids
|
114 |
except Exception as e:
|
115 |
logger.error(f"Error ingesting file {file_path}: {str(e)}")
|
|
|
116 |
# Return placeholder IDs if there's an error
|
117 |
-
return [f"error-{random.randint(1000, 9999)}"
|
118 |
|
119 |
def ingest_text(self, text: str, metadata: Dict[str, Any] = None) -> List[str]:
|
120 |
"""Ingest raw text into the vector database."""
|
121 |
try:
|
|
|
|
|
|
|
|
|
122 |
if metadata is None:
|
123 |
metadata = {}
|
124 |
|
@@ -126,23 +197,35 @@ class DocumentProcessor:
|
|
126 |
chunks = self.text_splitter.split_text(text)
|
127 |
logger.info(f"Split text into {len(chunks)} chunks")
|
128 |
|
|
|
|
|
|
|
|
|
129 |
# Prepare metadatas
|
130 |
metadatas = []
|
131 |
for i in range(len(chunks)):
|
132 |
chunk_metadata = metadata.copy()
|
133 |
chunk_metadata["chunk_id"] = i
|
|
|
134 |
chunk_metadata["source"] = "direct_input"
|
|
|
135 |
metadatas.append(chunk_metadata)
|
136 |
|
137 |
# Store in vector database with retry mechanism
|
138 |
def add_to_vectordb():
|
139 |
return self.memory_manager.add_texts(chunks, metadatas)
|
140 |
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
144 |
return ids
|
145 |
except Exception as e:
|
146 |
logger.error(f"Error ingesting text: {str(e)}")
|
|
|
147 |
# Return placeholder IDs if there's an error
|
148 |
-
return [f"error-{random.randint(1000, 9999)}"
|
|
|
3 |
import logging
|
4 |
import time
|
5 |
import random
|
6 |
+
import traceback
|
7 |
from typing import List, Dict, Any
|
8 |
from langchain.document_loaders import (
|
9 |
PyPDFLoader,
|
10 |
TextLoader,
|
11 |
+
CSVLoader,
|
12 |
+
UnstructuredFileLoader,
|
13 |
+
Docx2txtLoader
|
14 |
)
|
15 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
16 |
|
|
|
45 |
|
46 |
logger.info(f"Processing file: {file_path} with extension {extension}")
|
47 |
|
48 |
+
# Verify file is readable
|
49 |
+
try:
|
50 |
+
with open(file_path, 'rb') as f:
|
51 |
+
# Just check if we can read from it
|
52 |
+
f.read(1)
|
53 |
+
except Exception as e:
|
54 |
+
logger.error(f"Cannot read file {file_path}: {e}")
|
55 |
+
raise IOError(f"File {file_path} exists but cannot be read: {str(e)}")
|
56 |
+
|
57 |
# Load the file using the appropriate loader
|
58 |
+
try:
|
59 |
+
if extension == '.pdf':
|
60 |
+
loader = PyPDFLoader(file_path)
|
61 |
+
elif extension == '.txt':
|
62 |
+
loader = TextLoader(file_path)
|
63 |
+
elif extension == '.csv':
|
64 |
+
loader = CSVLoader(file_path)
|
65 |
+
elif extension in ['.doc', '.docx']:
|
66 |
+
loader = Docx2txtLoader(file_path)
|
67 |
+
elif extension in ['.md', '.html', '.htm', '.xml', '.json']:
|
68 |
+
# Dedicated loaders could be added for these formats
|
69 |
+
loader = TextLoader(file_path)
|
70 |
+
else:
|
71 |
+
# Try generic loader as fallback for unsupported types
|
72 |
+
logger.warning(f"No specific loader for {extension}, trying UnstructuredFileLoader")
|
73 |
+
loader = UnstructuredFileLoader(file_path)
|
74 |
+
|
75 |
+
# Load and split the documents
|
76 |
+
documents = loader.load()
|
77 |
+
|
78 |
+
if not documents:
|
79 |
+
logger.warning(f"No content extracted from {file_path}")
|
80 |
+
# Create a minimal document if empty to avoid errors
|
81 |
+
from langchain.schema import Document
|
82 |
+
documents = [Document(page_content=f"Empty file: {os.path.basename(file_path)}",
|
83 |
+
metadata={"source": file_path})]
|
84 |
+
|
85 |
+
chunks = self.text_splitter.split_documents(documents)
|
86 |
+
|
87 |
+
logger.info(f"Split file into {len(chunks)} chunks")
|
88 |
+
return chunks
|
89 |
+
|
90 |
+
except Exception as e:
|
91 |
+
logger.error(f"Error in document processing: {str(e)}")
|
92 |
+
logger.error(traceback.format_exc())
|
93 |
+
|
94 |
+
# Create a minimal document to represent the error
|
95 |
+
from langchain.schema import Document
|
96 |
+
error_doc = Document(
|
97 |
+
page_content=f"Error processing file {os.path.basename(file_path)}: {str(e)}",
|
98 |
+
metadata={"source": file_path, "error": str(e)}
|
99 |
+
)
|
100 |
+
return [error_doc]
|
101 |
|
102 |
def _retry_operation(self, operation, max_retries=3):
|
103 |
"""Retry an operation with exponential backoff."""
|
104 |
+
last_exception = None
|
105 |
for attempt in range(max_retries):
|
106 |
try:
|
107 |
return operation()
|
108 |
except Exception as e:
|
109 |
+
last_exception = e
|
110 |
if "already accessed by another instance" in str(e) and attempt < max_retries - 1:
|
111 |
wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
|
112 |
logger.warning(f"Vector store access conflict, retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
|
113 |
time.sleep(wait_time)
|
114 |
+
elif attempt < max_retries - 1:
|
115 |
+
# For other errors, also retry but with different message
|
116 |
+
wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
|
117 |
+
logger.warning(f"Operation failed ({str(e)}), retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
|
118 |
+
time.sleep(wait_time)
|
119 |
else:
|
120 |
# Different error or last attempt, re-raise
|
121 |
raise
|
122 |
+
|
123 |
+
# If we get here with a last_exception, re-raise it
|
124 |
+
if last_exception:
|
125 |
+
raise last_exception
|
126 |
+
else:
|
127 |
+
raise RuntimeError("Retry operation failed but no exception was captured")
|
128 |
|
129 |
def ingest_file(self, file_path: str, metadata: Dict[str, Any] = None) -> List[str]:
|
130 |
"""Ingest a file into the vector database."""
|
|
|
139 |
# Add file path to metadata
|
140 |
base_metadata = {
|
141 |
"source": file_path,
|
142 |
+
"file_name": os.path.basename(file_path),
|
143 |
+
"ingestion_time": time.strftime("%Y-%m-%d %H:%M:%S")
|
144 |
}
|
145 |
base_metadata.update(metadata)
|
146 |
|
|
|
153 |
if hasattr(chunk, 'metadata'):
|
154 |
chunk_metadata.update(chunk.metadata)
|
155 |
chunk_metadata["chunk_id"] = i
|
156 |
+
chunk_metadata["total_chunks"] = len(chunks)
|
157 |
metadatas.append(chunk_metadata)
|
158 |
|
159 |
# Store in vector database with retry mechanism
|
160 |
logger.info(f"Adding {len(texts)} chunks to vector database")
|
161 |
|
162 |
+
# Handle empty texts to avoid errors
|
163 |
+
if not texts:
|
164 |
+
logger.warning("No text chunks extracted from file, adding placeholder")
|
165 |
+
texts = [f"Empty file: {os.path.basename(file_path)}"]
|
166 |
+
metadatas = [{"source": file_path, "file_name": os.path.basename(file_path), "empty_file": True}]
|
167 |
+
|
168 |
def add_to_vectordb():
|
169 |
return self.memory_manager.add_texts(texts, metadatas)
|
170 |
|
171 |
+
try:
|
172 |
+
ids = self._retry_operation(add_to_vectordb)
|
173 |
+
logger.info(f"Successfully added chunks with IDs: {ids[:3] if len(ids) > 3 else ids}...")
|
174 |
+
except Exception as e:
|
175 |
+
logger.error(f"All attempts to add to vector DB failed: {e}")
|
176 |
+
# Return placeholder IDs
|
177 |
+
ids = [f"error-{random.randint(1000, 9999)}" for _ in range(len(texts))]
|
178 |
|
179 |
return ids
|
180 |
except Exception as e:
|
181 |
logger.error(f"Error ingesting file {file_path}: {str(e)}")
|
182 |
+
logger.error(traceback.format_exc())
|
183 |
# Return placeholder IDs if there's an error
|
184 |
+
return [f"error-{random.randint(1000, 9999)}"]
|
185 |
|
186 |
def ingest_text(self, text: str, metadata: Dict[str, Any] = None) -> List[str]:
|
187 |
"""Ingest raw text into the vector database."""
|
188 |
try:
|
189 |
+
if not text.strip():
|
190 |
+
logger.warning("Empty text provided for ingestion")
|
191 |
+
return ["empty-text-error"]
|
192 |
+
|
193 |
if metadata is None:
|
194 |
metadata = {}
|
195 |
|
|
|
197 |
chunks = self.text_splitter.split_text(text)
|
198 |
logger.info(f"Split text into {len(chunks)} chunks")
|
199 |
|
200 |
+
# If text splitting produced no chunks (unusual), create one
|
201 |
+
if not chunks:
|
202 |
+
chunks = ["Empty text input"]
|
203 |
+
|
204 |
# Prepare metadatas
|
205 |
metadatas = []
|
206 |
for i in range(len(chunks)):
|
207 |
chunk_metadata = metadata.copy()
|
208 |
chunk_metadata["chunk_id"] = i
|
209 |
+
chunk_metadata["total_chunks"] = len(chunks)
|
210 |
chunk_metadata["source"] = "direct_input"
|
211 |
+
chunk_metadata["ingestion_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
|
212 |
metadatas.append(chunk_metadata)
|
213 |
|
214 |
# Store in vector database with retry mechanism
|
215 |
def add_to_vectordb():
|
216 |
return self.memory_manager.add_texts(chunks, metadatas)
|
217 |
|
218 |
+
try:
|
219 |
+
ids = self._retry_operation(add_to_vectordb)
|
220 |
+
logger.info(f"Successfully added text chunks with IDs: {ids[:3] if len(ids) > 3 else ids}...")
|
221 |
+
except Exception as e:
|
222 |
+
logger.error(f"All attempts to add text to vector DB failed: {e}")
|
223 |
+
# Return placeholder IDs
|
224 |
+
ids = [f"error-{random.randint(1000, 9999)}" for _ in range(len(chunks))]
|
225 |
+
|
226 |
return ids
|
227 |
except Exception as e:
|
228 |
logger.error(f"Error ingesting text: {str(e)}")
|
229 |
+
logger.error(traceback.format_exc())
|
230 |
# Return placeholder IDs if there's an error
|
231 |
+
return [f"error-{random.randint(1000, 9999)}"]
|
app/core/memory.py
CHANGED
@@ -166,15 +166,28 @@ class MemoryManager:
|
|
166 |
relevant_docs = retriever.get_relevant_documents(question)
|
167 |
|
168 |
# Format the context from relevant documents
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
# Get chat history from memory
|
172 |
chat_history = self.memory.chat_memory.messages
|
173 |
chat_history_str = "\n".join([f"{msg.type}: {msg.content}" for msg in chat_history])
|
174 |
|
175 |
-
# Create the prompt
|
176 |
-
prompt = f"""You are a helpful AI assistant. Answer the following question based on the provided context.
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
Context:
|
179 |
{context}
|
180 |
|
@@ -184,8 +197,34 @@ Chat History:
|
|
184 |
Question: {question}
|
185 |
Answer:"""
|
186 |
|
187 |
-
# Get the answer from the LLM
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
return {
|
191 |
"answer": answer,
|
@@ -194,7 +233,7 @@ Answer:"""
|
|
194 |
except Exception as e:
|
195 |
logger.error(f"Error in simple_chain: {e}")
|
196 |
return {
|
197 |
-
"answer": f"I encountered an error
|
198 |
"sources": []
|
199 |
}
|
200 |
|
|
|
166 |
relevant_docs = retriever.get_relevant_documents(question)
|
167 |
|
168 |
# Format the context from relevant documents
|
169 |
+
context_parts = []
|
170 |
+
for i, doc in enumerate(relevant_docs):
|
171 |
+
source_name = doc.metadata.get("file_name", "Unknown Source")
|
172 |
+
context_parts.append(f"Document {i+1} [{source_name}]:\n{doc.page_content}\n")
|
173 |
+
|
174 |
+
context = "\n".join(context_parts) if context_parts else "No relevant documents found."
|
175 |
|
176 |
# Get chat history from memory
|
177 |
chat_history = self.memory.chat_memory.messages
|
178 |
chat_history_str = "\n".join([f"{msg.type}: {msg.content}" for msg in chat_history])
|
179 |
|
180 |
+
# Create the improved prompt with better instructions
|
181 |
+
prompt = f"""You are a helpful, accurate, and precise AI assistant. Answer the following question based on the provided context.
|
182 |
+
|
183 |
+
Follow these guidelines when responding:
|
184 |
+
1. If the context contains relevant information, use it to provide a direct and specific answer.
|
185 |
+
2. Format your answer in clear, concise paragraphs with appropriate spacing.
|
186 |
+
3. If the answer is not in the context, acknowledge this and provide a general response based on your knowledge.
|
187 |
+
4. Do not mention "context" or "documents" in your answer - integrate the information naturally.
|
188 |
+
5. Keep answers factual, helpful, and to the point.
|
189 |
+
6. Never make up information that isn't supported by the context.
|
190 |
+
|
191 |
Context:
|
192 |
{context}
|
193 |
|
|
|
197 |
Question: {question}
|
198 |
Answer:"""
|
199 |
|
200 |
+
# Get the answer from the LLM with a timeout and retries
|
201 |
+
try:
|
202 |
+
answer = self.llm(prompt)
|
203 |
+
|
204 |
+
# Simple quality check - if too short or generic, try again
|
205 |
+
if len(answer.strip()) < 20 or "I don't have enough information" in answer:
|
206 |
+
logger.info("Answer quality check failed, retrying with modified prompt")
|
207 |
+
|
208 |
+
# Add a more specific instruction to the prompt
|
209 |
+
enhanced_prompt = prompt + "\n\nPlease be as helpful as possible with the information available."
|
210 |
+
second_attempt = self.llm(enhanced_prompt)
|
211 |
+
|
212 |
+
# Use the better of the two responses
|
213 |
+
if len(second_attempt.strip()) > len(answer.strip()):
|
214 |
+
answer = second_attempt
|
215 |
+
except Exception as llm_error:
|
216 |
+
logger.error(f"Error getting answer from LLM: {llm_error}")
|
217 |
+
if not answer: # If answer wasn't set due to first attempt exception
|
218 |
+
answer = f"I'm having trouble generating a response right now. Please try again in a moment."
|
219 |
+
|
220 |
+
# Perform basic formatting cleanup
|
221 |
+
answer = answer.strip()
|
222 |
+
|
223 |
+
# Remove common prefixes that models sometimes add
|
224 |
+
prefixes_to_remove = ["Answer:", "AI:", "Assistant:"]
|
225 |
+
for prefix in prefixes_to_remove:
|
226 |
+
if answer.startswith(prefix):
|
227 |
+
answer = answer[len(prefix):].strip()
|
228 |
|
229 |
return {
|
230 |
"answer": answer,
|
|
|
233 |
except Exception as e:
|
234 |
logger.error(f"Error in simple_chain: {e}")
|
235 |
return {
|
236 |
+
"answer": f"I encountered an error while processing your question. Please try again with a different query.",
|
237 |
"sources": []
|
238 |
}
|
239 |
|
app/ui/streamlit_app.py
CHANGED
@@ -92,14 +92,42 @@ with st.sidebar:
|
|
92 |
|
93 |
# Add file uploader with error handling
|
94 |
try:
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
if uploaded_file is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
# Handle the uploaded file
|
99 |
if st.button("Process Document"):
|
100 |
with st.spinner("Processing document..."):
|
|
|
|
|
|
|
101 |
try:
|
102 |
# Create a temporary file with proper error handling
|
|
|
103 |
temp_dir = tempfile.gettempdir()
|
104 |
temp_path = os.path.join(temp_dir, uploaded_file.name)
|
105 |
|
@@ -110,6 +138,7 @@ with st.sidebar:
|
|
110 |
temp_file.write(uploaded_file.getvalue())
|
111 |
|
112 |
# Get a path to store the document permanently
|
|
|
113 |
doc_path = get_document_path(uploaded_file.name)
|
114 |
|
115 |
# Copy the file to the documents directory
|
@@ -119,22 +148,32 @@ with st.sidebar:
|
|
119 |
if not copy_success:
|
120 |
logger.warning("Using temporary file path instead of documents directory")
|
121 |
doc_path = temp_path
|
|
|
122 |
|
123 |
# Ingest the document with retry logic for 403 errors
|
124 |
-
|
|
|
125 |
max_retries = 3
|
126 |
|
127 |
for attempt in range(max_retries):
|
128 |
try:
|
129 |
-
|
|
|
|
|
130 |
break
|
131 |
except Exception as e:
|
132 |
error_str = str(e).lower()
|
133 |
if ("403" in error_str or "forbidden" in error_str or "permission" in error_str) and attempt < max_retries - 1:
|
|
|
134 |
logger.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
|
135 |
time.sleep(1.5) # Add delay between retries
|
|
|
|
|
|
|
|
|
|
|
136 |
else:
|
137 |
-
raise # Re-raise
|
138 |
|
139 |
# Clean up the temporary file if different from doc_path
|
140 |
if temp_path != doc_path and os.path.exists(temp_path):
|
@@ -144,19 +183,45 @@ with st.sidebar:
|
|
144 |
except Exception as e:
|
145 |
logger.warning(f"Could not remove temporary file: {e}")
|
146 |
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
except Exception as e:
|
|
|
149 |
logger.error(f"Error processing document: {str(e)}")
|
150 |
-
|
151 |
|
152 |
if "403" in str(e) or "forbidden" in str(e).lower():
|
153 |
st.warning("This appears to be a permissions issue. Try using a different file format or using the text input option instead.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"File uploader error: {str(e)}")
|
156 |
st.error(f"File upload functionality is currently unavailable: {str(e)}")
|
157 |
|
158 |
-
st.
|
159 |
-
|
|
|
160 |
|
161 |
if st.button("Add Text"):
|
162 |
if text_input:
|
@@ -168,13 +233,22 @@ with st.sidebar:
|
|
168 |
"timestamp": str(datetime.now())
|
169 |
}
|
170 |
|
|
|
|
|
|
|
|
|
171 |
# Ingest the text
|
172 |
-
document_processor.ingest_text(text_input, metadata)
|
173 |
|
174 |
-
|
|
|
|
|
|
|
175 |
except Exception as e:
|
176 |
logger.error(f"Error adding text: {str(e)}")
|
177 |
st.error(f"Error adding text: {str(e)}")
|
|
|
|
|
178 |
|
179 |
# Display model information
|
180 |
st.header("Models")
|
|
|
92 |
|
93 |
# Add file uploader with error handling
|
94 |
try:
|
95 |
+
st.subheader("Upload a File")
|
96 |
+
|
97 |
+
# Show supported file types info
|
98 |
+
with st.expander("Supported File Types"):
|
99 |
+
st.markdown("""
|
100 |
+
- **PDF** (.pdf) - Best for formatted documents
|
101 |
+
- **Text** (.txt) - Simple text files
|
102 |
+
- **CSV** (.csv) - Structured data
|
103 |
+
- **Word** (.doc, .docx) - Microsoft Word documents
|
104 |
+
- **Markdown** (.md) - Formatted text
|
105 |
+
- **HTML** (.html, .htm) - Web pages
|
106 |
+
|
107 |
+
Other file types may work but are not fully supported.
|
108 |
+
""")
|
109 |
+
|
110 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "csv", "doc", "docx", "md", "html", "htm", "xml", "json"])
|
111 |
|
112 |
if uploaded_file is not None:
|
113 |
+
# Display file info
|
114 |
+
file_details = {
|
115 |
+
"Filename": uploaded_file.name,
|
116 |
+
"File size": f"{uploaded_file.size / 1024:.1f} KB",
|
117 |
+
"File type": uploaded_file.type
|
118 |
+
}
|
119 |
+
|
120 |
+
st.json(file_details)
|
121 |
+
|
122 |
# Handle the uploaded file
|
123 |
if st.button("Process Document"):
|
124 |
with st.spinner("Processing document..."):
|
125 |
+
status_placeholder = st.empty()
|
126 |
+
status_placeholder.info("Starting document processing...")
|
127 |
+
|
128 |
try:
|
129 |
# Create a temporary file with proper error handling
|
130 |
+
status_placeholder.info("Creating temporary file...")
|
131 |
temp_dir = tempfile.gettempdir()
|
132 |
temp_path = os.path.join(temp_dir, uploaded_file.name)
|
133 |
|
|
|
138 |
temp_file.write(uploaded_file.getvalue())
|
139 |
|
140 |
# Get a path to store the document permanently
|
141 |
+
status_placeholder.info("Preparing document storage location...")
|
142 |
doc_path = get_document_path(uploaded_file.name)
|
143 |
|
144 |
# Copy the file to the documents directory
|
|
|
148 |
if not copy_success:
|
149 |
logger.warning("Using temporary file path instead of documents directory")
|
150 |
doc_path = temp_path
|
151 |
+
status_placeholder.warning("Using temporary storage (document won't be permanently saved)")
|
152 |
|
153 |
# Ingest the document with retry logic for 403 errors
|
154 |
+
status_placeholder.info("Analyzing and indexing document content...")
|
155 |
+
progress_bar = st.progress(0)
|
156 |
max_retries = 3
|
157 |
|
158 |
for attempt in range(max_retries):
|
159 |
try:
|
160 |
+
progress_bar.progress((attempt * 30) / 100) # Show progress as we attempt
|
161 |
+
ids = document_processor.ingest_file(temp_path, {"original_name": uploaded_file.name})
|
162 |
+
progress_bar.progress(100)
|
163 |
break
|
164 |
except Exception as e:
|
165 |
error_str = str(e).lower()
|
166 |
if ("403" in error_str or "forbidden" in error_str or "permission" in error_str) and attempt < max_retries - 1:
|
167 |
+
status_placeholder.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
|
168 |
logger.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
|
169 |
time.sleep(1.5) # Add delay between retries
|
170 |
+
elif attempt < max_retries - 1:
|
171 |
+
# General retry for any error
|
172 |
+
status_placeholder.warning(f"Error ({attempt+1}/{max_retries}), retrying...")
|
173 |
+
logger.warning(f"Error during ingestion ({attempt+1}/{max_retries}): {e}")
|
174 |
+
time.sleep(1.5)
|
175 |
else:
|
176 |
+
raise # Re-raise on last attempt
|
177 |
|
178 |
# Clean up the temporary file if different from doc_path
|
179 |
if temp_path != doc_path and os.path.exists(temp_path):
|
|
|
183 |
except Exception as e:
|
184 |
logger.warning(f"Could not remove temporary file: {e}")
|
185 |
|
186 |
+
# Check if ingestion was successful based on IDs
|
187 |
+
if ids and not all(str(id).startswith("error-") for id in ids):
|
188 |
+
status_placeholder.success(f"✅ Document processed successfully!")
|
189 |
+
st.balloons() # Celebrate success
|
190 |
+
else:
|
191 |
+
status_placeholder.warning("⚠️ Document processed with warnings. Some content may not be fully indexed.")
|
192 |
+
|
193 |
except Exception as e:
|
194 |
+
progress_bar = st.progress(100) if 'progress_bar' in locals() else st.progress(0)
|
195 |
logger.error(f"Error processing document: {str(e)}")
|
196 |
+
status_placeholder.error(f"❌ Error processing document: {str(e)}")
|
197 |
|
198 |
if "403" in str(e) or "forbidden" in str(e).lower():
|
199 |
st.warning("This appears to be a permissions issue. Try using a different file format or using the text input option instead.")
|
200 |
+
elif "unsupported" in str(e).lower() or "not supported" in str(e).lower() or "no specific loader" in str(e).lower():
|
201 |
+
st.warning("This file type may not be fully supported. Try converting to PDF or TXT format.")
|
202 |
+
elif "memory" in str(e).lower():
|
203 |
+
st.warning("The file may be too large to process. Try a smaller file or split the content.")
|
204 |
+
elif "timeout" in str(e).lower():
|
205 |
+
st.warning("Processing timed out. Try a smaller file or try again later.")
|
206 |
+
|
207 |
+
# Show troubleshooting tips
|
208 |
+
with st.expander("Troubleshooting Tips"):
|
209 |
+
st.markdown("""
|
210 |
+
- Convert your document to PDF or plain text format
|
211 |
+
- Try a smaller file (under 1MB)
|
212 |
+
- Remove any password protection from the file
|
213 |
+
- Try the text input option below instead
|
214 |
+
- Check if the file contains complex formatting or images
|
215 |
+
""")
|
216 |
+
|
217 |
+
st.markdown("---")
|
218 |
except Exception as e:
|
219 |
logger.error(f"File uploader error: {str(e)}")
|
220 |
st.error(f"File upload functionality is currently unavailable: {str(e)}")
|
221 |
|
222 |
+
st.subheader("Raw Text Input")
|
223 |
+
st.markdown("Alternatively, paste text directly to add to the knowledge base:")
|
224 |
+
text_input = st.text_area("Enter text to add to the knowledge base", height=150)
|
225 |
|
226 |
if st.button("Add Text"):
|
227 |
if text_input:
|
|
|
233 |
"timestamp": str(datetime.now())
|
234 |
}
|
235 |
|
236 |
+
# Ingest the text with progress indication
|
237 |
+
status_text = st.empty()
|
238 |
+
status_text.info("Processing text...")
|
239 |
+
|
240 |
# Ingest the text
|
241 |
+
ids = document_processor.ingest_text(text_input, metadata)
|
242 |
|
243 |
+
if ids and not any(str(id).startswith("error-") for id in ids):
|
244 |
+
status_text.success("✅ Text added to knowledge base successfully!")
|
245 |
+
else:
|
246 |
+
status_text.warning("⚠️ Text processing completed with warnings")
|
247 |
except Exception as e:
|
248 |
logger.error(f"Error adding text: {str(e)}")
|
249 |
st.error(f"Error adding text: {str(e)}")
|
250 |
+
else:
|
251 |
+
st.warning("Please enter some text to add")
|
252 |
|
253 |
# Display model information
|
254 |
st.header("Models")
|