Spaces:

p3rc03
/

2B

Sleeping

37-AN committed on May 14

Commit

28ff371

1 Parent(s): f8ed285

Fix AI responses and file uploading functionality

- Improved AI responses with better prompt formatting and instructions
- Enhanced file upload handling with better error recovery
- Added support for more file types (docx, html, md, etc.)
- Improved UI with progress tracking and better error messages
- Fixed edge cases with empty files and error handling

Files changed (3) hide show

app/core/ingestion.py +107 -24
app/core/memory.py +46 -7
app/ui/streamlit_app.py +84 -10

app/core/ingestion.py CHANGED Viewed

@@ -3,11 +3,14 @@ import sys
 import logging
 import time
 import random
 from typing import List, Dict, Any
 from langchain.document_loaders import (
     PyPDFLoader,
     TextLoader,
-    CSVLoader
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -42,36 +45,86 @@ class DocumentProcessor:
         logger.info(f"Processing file: {file_path} with extension {extension}")
         # Load the file using the appropriate loader
-        if extension == '.pdf':
-            loader = PyPDFLoader(file_path)
-        elif extension == '.txt':
-            loader = TextLoader(file_path)
-        elif extension == '.csv':
-            loader = CSVLoader(file_path)
-        else:
-            raise ValueError(f"Unsupported file type: {extension}")
-        # Load and split the documents
-        documents = loader.load()
-        chunks = self.text_splitter.split_documents(documents)
-        logger.info(f"Split file into {len(chunks)} chunks")
-        return chunks
     def _retry_operation(self, operation, max_retries=3):
         """Retry an operation with exponential backoff."""
         for attempt in range(max_retries):
             try:
                 return operation()
             except Exception as e:
                 if "already accessed by another instance" in str(e) and attempt < max_retries - 1:
                     wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
                     logger.warning(f"Vector store access conflict, retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
                     time.sleep(wait_time)
                 else:
                     # Different error or last attempt, re-raise
                     raise
     def ingest_file(self, file_path: str, metadata: Dict[str, Any] = None) -> List[str]:
         """Ingest a file into the vector database."""
@@ -86,7 +139,8 @@ class DocumentProcessor:
             # Add file path to metadata
             base_metadata = {
                 "source": file_path,
-                "file_name": os.path.basename(file_path)
             }
             base_metadata.update(metadata)
@@ -99,26 +153,43 @@ class DocumentProcessor:
                 if hasattr(chunk, 'metadata'):
                     chunk_metadata.update(chunk.metadata)
                 chunk_metadata["chunk_id"] = i
                 metadatas.append(chunk_metadata)
             # Store in vector database with retry mechanism
             logger.info(f"Adding {len(texts)} chunks to vector database")
             def add_to_vectordb():
                 return self.memory_manager.add_texts(texts, metadatas)
-            ids = self._retry_operation(add_to_vectordb)
-            logger.info(f"Successfully added chunks with IDs: {ids[:3]}...")
             return ids
         except Exception as e:
             logger.error(f"Error ingesting file {file_path}: {str(e)}")
             # Return placeholder IDs if there's an error
-            return [f"error-{random.randint(1000, 9999)}" for _ in range(len(chunks) if 'chunks' in locals() else 1)]
     def ingest_text(self, text: str, metadata: Dict[str, Any] = None) -> List[str]:
         """Ingest raw text into the vector database."""
         try:
             if metadata is None:
                 metadata = {}
@@ -126,23 +197,35 @@ class DocumentProcessor:
             chunks = self.text_splitter.split_text(text)
             logger.info(f"Split text into {len(chunks)} chunks")
             # Prepare metadatas
             metadatas = []
             for i in range(len(chunks)):
                 chunk_metadata = metadata.copy()
                 chunk_metadata["chunk_id"] = i
                 chunk_metadata["source"] = "direct_input"
                 metadatas.append(chunk_metadata)
             # Store in vector database with retry mechanism
             def add_to_vectordb():
                 return self.memory_manager.add_texts(chunks, metadatas)
-            ids = self._retry_operation(add_to_vectordb)
-            logger.info(f"Successfully added text chunks with IDs: {ids[:3] if len(ids) > 3 else ids}...")
             return ids
         except Exception as e:
             logger.error(f"Error ingesting text: {str(e)}")
             # Return placeholder IDs if there's an error
-            return [f"error-{random.randint(1000, 9999)}" for _ in range(len(chunks) if 'chunks' in locals() else 1)]

 import logging
 import time
 import random
+import traceback
 from typing import List, Dict, Any
 from langchain.document_loaders import (
     PyPDFLoader,
     TextLoader,
+    CSVLoader,
+    UnstructuredFileLoader,
+    Docx2txtLoader
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
         logger.info(f"Processing file: {file_path} with extension {extension}")
+        # Verify file is readable
+        try:
+            with open(file_path, 'rb') as f:
+                # Just check if we can read from it
+                f.read(1)
+        except Exception as e:
+            logger.error(f"Cannot read file {file_path}: {e}")
+            raise IOError(f"File {file_path} exists but cannot be read: {str(e)}")
         # Load the file using the appropriate loader
+        try:
+            if extension == '.pdf':
+                loader = PyPDFLoader(file_path)
+            elif extension == '.txt':
+                loader = TextLoader(file_path)
+            elif extension == '.csv':
+                loader = CSVLoader(file_path)
+            elif extension in ['.doc', '.docx']:
+                loader = Docx2txtLoader(file_path)
+            elif extension in ['.md', '.html', '.htm', '.xml', '.json']:
+                # Dedicated loaders could be added for these formats
+                loader = TextLoader(file_path)
+            else:
+                # Try generic loader as fallback for unsupported types
+                logger.warning(f"No specific loader for {extension}, trying UnstructuredFileLoader")
+                loader = UnstructuredFileLoader(file_path)
+            # Load and split the documents
+            documents = loader.load()
+            if not documents:
+                logger.warning(f"No content extracted from {file_path}")
+                # Create a minimal document if empty to avoid errors
+                from langchain.schema import Document
+                documents = [Document(page_content=f"Empty file: {os.path.basename(file_path)}",
+                                    metadata={"source": file_path})]
+            chunks = self.text_splitter.split_documents(documents)
+            logger.info(f"Split file into {len(chunks)} chunks")
+            return chunks
+        except Exception as e:
+            logger.error(f"Error in document processing: {str(e)}")
+            logger.error(traceback.format_exc())
+            # Create a minimal document to represent the error
+            from langchain.schema import Document
+            error_doc = Document(
+                page_content=f"Error processing file {os.path.basename(file_path)}: {str(e)}",
+                metadata={"source": file_path, "error": str(e)}
+            )
+            return [error_doc]
     def _retry_operation(self, operation, max_retries=3):
         """Retry an operation with exponential backoff."""
+        last_exception = None
         for attempt in range(max_retries):
             try:
                 return operation()
             except Exception as e:
+                last_exception = e
                 if "already accessed by another instance" in str(e) and attempt < max_retries - 1:
                     wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
                     logger.warning(f"Vector store access conflict, retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
                     time.sleep(wait_time)
+                elif attempt < max_retries - 1:
+                    # For other errors, also retry but with different message
+                    wait_time = random.uniform(0.5, 2.0) * (attempt + 1)
+                    logger.warning(f"Operation failed ({str(e)}), retrying ({attempt+1}/{max_retries}) in {wait_time:.2f}s...")
+                    time.sleep(wait_time)
                 else:
                     # Different error or last attempt, re-raise
                     raise
+        # If we get here with a last_exception, re-raise it
+        if last_exception:
+            raise last_exception
+        else:
+            raise RuntimeError("Retry operation failed but no exception was captured")
     def ingest_file(self, file_path: str, metadata: Dict[str, Any] = None) -> List[str]:
         """Ingest a file into the vector database."""
             # Add file path to metadata
             base_metadata = {
                 "source": file_path,
+                "file_name": os.path.basename(file_path),
+                "ingestion_time": time.strftime("%Y-%m-%d %H:%M:%S")
             }
             base_metadata.update(metadata)
                 if hasattr(chunk, 'metadata'):
                     chunk_metadata.update(chunk.metadata)
                 chunk_metadata["chunk_id"] = i
+                chunk_metadata["total_chunks"] = len(chunks)
                 metadatas.append(chunk_metadata)
             # Store in vector database with retry mechanism
             logger.info(f"Adding {len(texts)} chunks to vector database")
+            # Handle empty texts to avoid errors
+            if not texts:
+                logger.warning("No text chunks extracted from file, adding placeholder")
+                texts = [f"Empty file: {os.path.basename(file_path)}"]
+                metadatas = [{"source": file_path, "file_name": os.path.basename(file_path), "empty_file": True}]
             def add_to_vectordb():
                 return self.memory_manager.add_texts(texts, metadatas)
+            try:
+                ids = self._retry_operation(add_to_vectordb)
+                logger.info(f"Successfully added chunks with IDs: {ids[:3] if len(ids) > 3 else ids}...")
+            except Exception as e:
+                logger.error(f"All attempts to add to vector DB failed: {e}")
+                # Return placeholder IDs
+                ids = [f"error-{random.randint(1000, 9999)}" for _ in range(len(texts))]
             return ids
         except Exception as e:
             logger.error(f"Error ingesting file {file_path}: {str(e)}")
+            logger.error(traceback.format_exc())
             # Return placeholder IDs if there's an error
+            return [f"error-{random.randint(1000, 9999)}"]
     def ingest_text(self, text: str, metadata: Dict[str, Any] = None) -> List[str]:
         """Ingest raw text into the vector database."""
         try:
+            if not text.strip():
+                logger.warning("Empty text provided for ingestion")
+                return ["empty-text-error"]
             if metadata is None:
                 metadata = {}
             chunks = self.text_splitter.split_text(text)
             logger.info(f"Split text into {len(chunks)} chunks")
+            # If text splitting produced no chunks (unusual), create one
+            if not chunks:
+                chunks = ["Empty text input"]
             # Prepare metadatas
             metadatas = []
             for i in range(len(chunks)):
                 chunk_metadata = metadata.copy()
                 chunk_metadata["chunk_id"] = i
+                chunk_metadata["total_chunks"] = len(chunks)
                 chunk_metadata["source"] = "direct_input"
+                chunk_metadata["ingestion_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
                 metadatas.append(chunk_metadata)
             # Store in vector database with retry mechanism
             def add_to_vectordb():
                 return self.memory_manager.add_texts(chunks, metadatas)
+            try:
+                ids = self._retry_operation(add_to_vectordb)
+                logger.info(f"Successfully added text chunks with IDs: {ids[:3] if len(ids) > 3 else ids}...")
+            except Exception as e:
+                logger.error(f"All attempts to add text to vector DB failed: {e}")
+                # Return placeholder IDs
+                ids = [f"error-{random.randint(1000, 9999)}" for _ in range(len(chunks))]
             return ids
         except Exception as e:
             logger.error(f"Error ingesting text: {str(e)}")
+            logger.error(traceback.format_exc())
             # Return placeholder IDs if there's an error
+            return [f"error-{random.randint(1000, 9999)}"]

app/core/memory.py CHANGED Viewed

@@ -166,15 +166,28 @@ class MemoryManager:
                     relevant_docs = retriever.get_relevant_documents(question)
                     # Format the context from relevant documents
-                    context = "\n\n".join([doc.page_content for doc in relevant_docs])
                     # Get chat history from memory
                     chat_history = self.memory.chat_memory.messages
                     chat_history_str = "\n".join([f"{msg.type}: {msg.content}" for msg in chat_history])
-                    # Create the prompt
-                    prompt = f"""You are a helpful AI assistant. Answer the following question based on the provided context.
 Context:
 {context}
@@ -184,8 +197,34 @@ Chat History:
 Question: {question}
 Answer:"""
-                    # Get the answer from the LLM
-                    answer = self.llm(prompt)
                     return {
                         "answer": answer,
@@ -194,7 +233,7 @@ Answer:"""
                 except Exception as e:
                     logger.error(f"Error in simple_chain: {e}")
                     return {
-                        "answer": f"I encountered an error: {str(e)}",
                         "sources": []
                     }

                     relevant_docs = retriever.get_relevant_documents(question)
                     # Format the context from relevant documents
+                    context_parts = []
+                    for i, doc in enumerate(relevant_docs):
+                        source_name = doc.metadata.get("file_name", "Unknown Source")
+                        context_parts.append(f"Document {i+1} [{source_name}]:\n{doc.page_content}\n")
+                    context = "\n".join(context_parts) if context_parts else "No relevant documents found."
                     # Get chat history from memory
                     chat_history = self.memory.chat_memory.messages
                     chat_history_str = "\n".join([f"{msg.type}: {msg.content}" for msg in chat_history])
+                    # Create the improved prompt with better instructions
+                    prompt = f"""You are a helpful, accurate, and precise AI assistant. Answer the following question based on the provided context.
+Follow these guidelines when responding:
+1. If the context contains relevant information, use it to provide a direct and specific answer.
+2. Format your answer in clear, concise paragraphs with appropriate spacing.
+3. If the answer is not in the context, acknowledge this and provide a general response based on your knowledge.
+4. Do not mention "context" or "documents" in your answer - integrate the information naturally.
+5. Keep answers factual, helpful, and to the point.
+6. Never make up information that isn't supported by the context.
 Context:
 {context}
 Question: {question}
 Answer:"""
+                    # Get the answer from the LLM with a timeout and retries
+                    try:
+                        answer = self.llm(prompt)
+                        # Simple quality check - if too short or generic, try again
+                        if len(answer.strip()) < 20 or "I don't have enough information" in answer:
+                            logger.info("Answer quality check failed, retrying with modified prompt")
+                            # Add a more specific instruction to the prompt
+                            enhanced_prompt = prompt + "\n\nPlease be as helpful as possible with the information available."
+                            second_attempt = self.llm(enhanced_prompt)
+                            # Use the better of the two responses
+                            if len(second_attempt.strip()) > len(answer.strip()):
+                                answer = second_attempt
+                    except Exception as llm_error:
+                        logger.error(f"Error getting answer from LLM: {llm_error}")
+                        if not answer:  # If answer wasn't set due to first attempt exception
+                            answer = f"I'm having trouble generating a response right now. Please try again in a moment."
+                    # Perform basic formatting cleanup
+                    answer = answer.strip()
+                    # Remove common prefixes that models sometimes add
+                    prefixes_to_remove = ["Answer:", "AI:", "Assistant:"]
+                    for prefix in prefixes_to_remove:
+                        if answer.startswith(prefix):
+                            answer = answer[len(prefix):].strip()
                     return {
                         "answer": answer,
                 except Exception as e:
                     logger.error(f"Error in simple_chain: {e}")
                     return {
+                        "answer": f"I encountered an error while processing your question. Please try again with a different query.",
                         "sources": []
                     }

app/ui/streamlit_app.py CHANGED Viewed

@@ -92,14 +92,42 @@ with st.sidebar:
     # Add file uploader with error handling
     try:
-        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "csv"])
         if uploaded_file is not None:
             # Handle the uploaded file
             if st.button("Process Document"):
                 with st.spinner("Processing document..."):
                     try:
                         # Create a temporary file with proper error handling
                         temp_dir = tempfile.gettempdir()
                         temp_path = os.path.join(temp_dir, uploaded_file.name)
@@ -110,6 +138,7 @@ with st.sidebar:
                             temp_file.write(uploaded_file.getvalue())
                         # Get a path to store the document permanently
                         doc_path = get_document_path(uploaded_file.name)
                         # Copy the file to the documents directory
@@ -119,22 +148,32 @@ with st.sidebar:
                         if not copy_success:
                             logger.warning("Using temporary file path instead of documents directory")
                             doc_path = temp_path
                         # Ingest the document with retry logic for 403 errors
-                        logger.info("Ingesting document")
                         max_retries = 3
                         for attempt in range(max_retries):
                             try:
-                                document_processor.ingest_file(temp_path, {"original_name": uploaded_file.name})
                                 break
                             except Exception as e:
                                 error_str = str(e).lower()
                                 if ("403" in error_str or "forbidden" in error_str or "permission" in error_str) and attempt < max_retries - 1:
                                     logger.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
                                     time.sleep(1.5)  # Add delay between retries
                                 else:
-                                    raise  # Re-raise if not a 403 error or on last attempt
                         # Clean up the temporary file if different from doc_path
                         if temp_path != doc_path and os.path.exists(temp_path):
@@ -144,19 +183,45 @@ with st.sidebar:
                             except Exception as e:
                                 logger.warning(f"Could not remove temporary file: {e}")
-                        st.success(f"Document {uploaded_file.name} processed successfully!")
                     except Exception as e:
                         logger.error(f"Error processing document: {str(e)}")
-                        st.error(f"Error processing document: {str(e)}")
                         if "403" in str(e) or "forbidden" in str(e).lower():
                             st.warning("This appears to be a permissions issue. Try using a different file format or using the text input option instead.")
     except Exception as e:
         logger.error(f"File uploader error: {str(e)}")
         st.error(f"File upload functionality is currently unavailable: {str(e)}")
-    st.header("Raw Text Input")
-    text_input = st.text_area("Enter text to add to the knowledge base")
     if st.button("Add Text"):
         if text_input:
@@ -168,13 +233,22 @@ with st.sidebar:
                         "timestamp": str(datetime.now())
                     }
                     # Ingest the text
-                    document_processor.ingest_text(text_input, metadata)
-                    st.success("Text added to knowledge base successfully!")
                 except Exception as e:
                     logger.error(f"Error adding text: {str(e)}")
                     st.error(f"Error adding text: {str(e)}")
     # Display model information
     st.header("Models")

     # Add file uploader with error handling
     try:
+        st.subheader("Upload a File")
+        # Show supported file types info
+        with st.expander("Supported File Types"):
+            st.markdown("""
+            - **PDF** (.pdf) - Best for formatted documents
+            - **Text** (.txt) - Simple text files
+            - **CSV** (.csv) - Structured data
+            - **Word** (.doc, .docx) - Microsoft Word documents
+            - **Markdown** (.md) - Formatted text
+            - **HTML** (.html, .htm) - Web pages
+            Other file types may work but are not fully supported.
+            """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "csv", "doc", "docx", "md", "html", "htm", "xml", "json"])
         if uploaded_file is not None:
+            # Display file info
+            file_details = {
+                "Filename": uploaded_file.name,
+                "File size": f"{uploaded_file.size / 1024:.1f} KB",
+                "File type": uploaded_file.type
+            }
+            st.json(file_details)
             # Handle the uploaded file
             if st.button("Process Document"):
                 with st.spinner("Processing document..."):
+                    status_placeholder = st.empty()
+                    status_placeholder.info("Starting document processing...")
                     try:
                         # Create a temporary file with proper error handling
+                        status_placeholder.info("Creating temporary file...")
                         temp_dir = tempfile.gettempdir()
                         temp_path = os.path.join(temp_dir, uploaded_file.name)
                             temp_file.write(uploaded_file.getvalue())
                         # Get a path to store the document permanently
+                        status_placeholder.info("Preparing document storage location...")
                         doc_path = get_document_path(uploaded_file.name)
                         # Copy the file to the documents directory
                         if not copy_success:
                             logger.warning("Using temporary file path instead of documents directory")
                             doc_path = temp_path
+                            status_placeholder.warning("Using temporary storage (document won't be permanently saved)")
                         # Ingest the document with retry logic for 403 errors
+                        status_placeholder.info("Analyzing and indexing document content...")
+                        progress_bar = st.progress(0)
                         max_retries = 3
                         for attempt in range(max_retries):
                             try:
+                                progress_bar.progress((attempt * 30) / 100)  # Show progress as we attempt
+                                ids = document_processor.ingest_file(temp_path, {"original_name": uploaded_file.name})
+                                progress_bar.progress(100)
                                 break
                             except Exception as e:
                                 error_str = str(e).lower()
                                 if ("403" in error_str or "forbidden" in error_str or "permission" in error_str) and attempt < max_retries - 1:
+                                    status_placeholder.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
                                     logger.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
                                     time.sleep(1.5)  # Add delay between retries
+                                elif attempt < max_retries - 1:
+                                    # General retry for any error
+                                    status_placeholder.warning(f"Error ({attempt+1}/{max_retries}), retrying...")
+                                    logger.warning(f"Error during ingestion ({attempt+1}/{max_retries}): {e}")
+                                    time.sleep(1.5)
                                 else:
+                                    raise  # Re-raise on last attempt
                         # Clean up the temporary file if different from doc_path
                         if temp_path != doc_path and os.path.exists(temp_path):
                             except Exception as e:
                                 logger.warning(f"Could not remove temporary file: {e}")
+                        # Check if ingestion was successful based on IDs
+                        if ids and not all(str(id).startswith("error-") for id in ids):
+                            status_placeholder.success(f"✅ Document processed successfully!")
+                            st.balloons()  # Celebrate success
+                        else:
+                            status_placeholder.warning("⚠️ Document processed with warnings. Some content may not be fully indexed.")
                     except Exception as e:
+                        progress_bar = st.progress(100) if 'progress_bar' in locals() else st.progress(0)
                         logger.error(f"Error processing document: {str(e)}")
+                        status_placeholder.error(f"❌ Error processing document: {str(e)}")
                         if "403" in str(e) or "forbidden" in str(e).lower():
                             st.warning("This appears to be a permissions issue. Try using a different file format or using the text input option instead.")
+                        elif "unsupported" in str(e).lower() or "not supported" in str(e).lower() or "no specific loader" in str(e).lower():
+                            st.warning("This file type may not be fully supported. Try converting to PDF or TXT format.")
+                        elif "memory" in str(e).lower():
+                            st.warning("The file may be too large to process. Try a smaller file or split the content.")
+                        elif "timeout" in str(e).lower():
+                            st.warning("Processing timed out. Try a smaller file or try again later.")
+                        # Show troubleshooting tips
+                        with st.expander("Troubleshooting Tips"):
+                            st.markdown("""
+                            - Convert your document to PDF or plain text format
+                            - Try a smaller file (under 1MB)
+                            - Remove any password protection from the file
+                            - Try the text input option below instead
+                            - Check if the file contains complex formatting or images
+                            """)
+        st.markdown("---")
     except Exception as e:
         logger.error(f"File uploader error: {str(e)}")
         st.error(f"File upload functionality is currently unavailable: {str(e)}")
+    st.subheader("Raw Text Input")
+    st.markdown("Alternatively, paste text directly to add to the knowledge base:")
+    text_input = st.text_area("Enter text to add to the knowledge base", height=150)
     if st.button("Add Text"):
         if text_input:
                         "timestamp": str(datetime.now())
                     }
+                    # Ingest the text with progress indication
+                    status_text = st.empty()
+                    status_text.info("Processing text...")
                     # Ingest the text
+                    ids = document_processor.ingest_text(text_input, metadata)
+                    if ids and not any(str(id).startswith("error-") for id in ids):
+                        status_text.success("✅ Text added to knowledge base successfully!")
+                    else:
+                        status_text.warning("⚠️ Text processing completed with warnings")
                 except Exception as e:
                     logger.error(f"Error adding text: {str(e)}")
                     st.error(f"Error adding text: {str(e)}")
+        else:
+            st.warning("Please enter some text to add")
     # Display model information
     st.header("Models")