37-AN committed
Commit f8ed285 · 1 parent: b725ad2

Fix 403 errors by improving model loading and error handling

Files changed (5)
  1. app/config.py +3 -2
  2. app/core/llm.py +123 -40
  3. app/core/memory.py +73 -23
  4. app/ui/streamlit_app.py +17 -2
  5. fix_403_error.py +97 -0
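
Taken together, the changes below replace single-shot model loads with fallback loops that pass the Hugging Face token when one is configured. As a condensed sketch of that pattern (model names taken from the diffs; the helper name and structure are illustrative, not code from the repo):

import os
from transformers import pipeline

def load_first_available(candidates=("gpt2", "distilgpt2")):
    """Return a text-generation pipeline for the first candidate that loads."""
    token = os.getenv("HF_API_KEY") or None  # pass a token if one is configured
    last_error = None
    for name in candidates:
        try:
            # use_auth_token mirrors the diff below; newer transformers
            # releases spell this parameter `token`
            return pipeline("text-generation", model=name,
                            use_auth_token=token, device=-1)
        except Exception as e:  # 403 / network / memory: try the next model
            last_error = e
    raise last_error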
app/config.py CHANGED
@@ -11,7 +11,8 @@ load_dotenv(dotenv_path=env_path)
 HF_API_KEY = os.getenv('HF_API_KEY', '')
 
 # LLM Configuration
-LLM_MODEL = os.getenv('LLM_MODEL', 'distilgpt2')
+# Use models that are freely accessible and don't require authentication
+LLM_MODEL = os.getenv('LLM_MODEL', 'gpt2')  # Changed from distilgpt2 to gpt2
 EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
 
 # Vector Database
@@ -45,7 +46,7 @@ def create_env_example():
 HF_API_KEY=your_huggingface_api_key_here
 
 # LLM Configuration
-LLM_MODEL=distilgpt2  # Use small model for Hugging Face Spaces
+LLM_MODEL=gpt2  # Use small model for Hugging Face Spaces
 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 
 # Vector Database
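
Since `app/config.py` reads the model name through `os.getenv`, the new `gpt2` default stays overridable without code changes; a hypothetical override (the model name here is only an example):

import os

# Must be set before app.config is imported; load_dotenv() does not
# overwrite variables that already exist in the environment.
os.environ["LLM_MODEL"] = "EleutherAI/gpt-neo-125M"  # example value

from app.config import LLM_MODEL
print(LLM_MODEL)  # -> EleutherAI/gpt-neo-125M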
app/core/llm.py CHANGED
@@ -37,38 +37,89 @@ def get_llm():
 
         logger.info(f"Loading model {LLM_MODEL} as local pipeline")
 
-        # Try loading with more specific model classes for better compatibility
-        try:
-            # Load tokenizer and model explicitly
-            tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
-            model = AutoModelForCausalLM.from_pretrained(LLM_MODEL)
-
-            # Create pipeline with loaded components
-            pipe = pipeline(
-                "text-generation",
-                model=model,
-                tokenizer=tokenizer,
-                max_length=MAX_TOKENS,
-                temperature=DEFAULT_TEMPERATURE
-            )
-
-            return HuggingFacePipeline(pipeline=pipe)
-        except Exception as e:
-            logger.warning(f"Error loading with explicit model/tokenizer: {e}")
-
-            # Fallback to simpler pipeline instantiation
-            pipe = pipeline(
-                "text-generation",
-                model=LLM_MODEL,
-                max_length=MAX_TOKENS,
-                temperature=DEFAULT_TEMPERATURE
-            )
-
-            return HuggingFacePipeline(pipeline=pipe)
+        # Try multiple fallbacks with increasingly simpler models
+        models_to_try = [
+            LLM_MODEL,
+            "distilgpt2",  # Smaller fallback
+            "gpt2",  # Standard fallback
+            "EleutherAI/gpt-neo-125M"  # Another option
+        ]
+
+        last_error = None
+
+        for model_name in models_to_try:
+            try:
+                logger.info(f"Attempting to load model: {model_name}")
+
+                # Try with explicit loading first
+                try:
+                    # Set trust_remote_code to False to avoid security issues
+                    tokenizer = AutoTokenizer.from_pretrained(
+                        model_name,
+                        use_auth_token=api_key if api_key else None,
+                        trust_remote_code=False
+                    )
+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        use_auth_token=api_key if api_key else None,
+                        trust_remote_code=False,
+                        low_cpu_mem_usage=True  # Help with memory issues
+                    )
+
+                    # Create pipeline with loaded components
+                    pipe = pipeline(
+                        "text-generation",
+                        model=model,
+                        tokenizer=tokenizer,
+                        max_length=MAX_TOKENS,
+                        temperature=DEFAULT_TEMPERATURE,
+                        device=-1  # Use CPU
+                    )
+
+                    logger.info(f"Successfully loaded model: {model_name}")
+                    return HuggingFacePipeline(pipeline=pipe)
+                except Exception as e:
+                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
+                    last_error = e
+
+                    # Try direct pipeline loading
+                    pipe = pipeline(
+                        "text-generation",
+                        model=model_name,
+                        max_length=MAX_TOKENS,
+                        temperature=DEFAULT_TEMPERATURE,
+                        use_auth_token=api_key if api_key else None,
+                        device=-1  # Use CPU
+                    )
+
+                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
+                    return HuggingFacePipeline(pipeline=pipe)
+
+            except Exception as e:
+                logger.warning(f"Error loading model {model_name}: {e}")
+                last_error = e
+                # Continue to the next model
+                continue
+
+        # If we get here, all models failed
+        logger.error(f"All models failed to load. Last error: {last_error}")
+        raise last_error
 
     except Exception as e:
         logger.warning(f"Error creating local pipeline: {e}")
 
+        # Try the HuggingFaceEndpoint as fallback
+        try:
+            logger.info("Attempting to use HuggingFaceEndpoint")
+            return HuggingFaceEndpoint(
+                repo_id="gpt2",
+                max_length=MAX_TOKENS,
+                temperature=DEFAULT_TEMPERATURE,
+                huggingfacehub_api_token=api_key
+            )
+        except Exception as endpoint_error:
+            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")
+
         # Last resort - mock LLM for fallback
         from langchain.llms.fake import FakeListLLM
         logger.warning("Using mock LLM as fallback")
@@ -92,20 +143,52 @@ def get_embeddings():
         logger.warning(f"Could not create cache directory: {e}")
         cache_dir = None
 
-    # Try to use local embeddings
+    # Try multiple models with fallbacks
+    embedding_models_to_try = [
+        EMBEDDING_MODEL,
+        "sentence-transformers/all-MiniLM-L6-v2",  # Standard model
+        "sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller model
+        "sentence-transformers/paraphrase-albert-small-v2"  # Even smaller model
+    ]
+
+    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
+
+    for model_name in embedding_models_to_try:
+        # Try to use local embeddings
+        try:
+            logger.info(f"Loading embeddings model: {model_name}")
+            return HuggingFaceEmbeddings(
+                model_name=model_name,
+                cache_folder=cache_dir,
+                encode_kwargs={"normalize_embeddings": True},
+                model_kwargs={"device": "cpu"}  # Ensure using CPU
+            )
+        except Exception as e:
+            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
+            # Continue to the next model
+
+    # If all models fail, try with direct transformers access
     try:
-        logger.info(f"Loading embeddings model: {EMBEDDING_MODEL}")
-        return HuggingFaceEmbeddings(
-            model_name=EMBEDDING_MODEL,
-            cache_folder=cache_dir
-        )
-    except Exception as e:
-        logger.warning(f"Error initializing embeddings: {e}")
+        from sentence_transformers import SentenceTransformer
+        logger.info("Loading embeddings with SentenceTransformer directly")
+        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
 
-        # Create mock embeddings that return random vectors for fallback
-        from langchain.embeddings.fake import FakeEmbeddings
-        logger.warning("Using mock embeddings as fallback")
-        return FakeEmbeddings(size=384)  # Standard size for small embedding models
+        # Create a custom embeddings class
+        class DirectEmbeddings:
+            def embed_documents(self, texts):
+                return model.encode(texts, normalize_embeddings=True).tolist()
+
+            def embed_query(self, text):
+                return model.encode(text, normalize_embeddings=True).tolist()
+
+        return DirectEmbeddings()
+    except Exception as e:
+        logger.warning(f"Error with direct SentenceTransformer: {e}")
+
+        # Create mock embeddings as last resort
+        from langchain.embeddings.fake import FakeEmbeddings
+        logger.warning("Using mock embeddings as fallback")
+        return FakeEmbeddings(size=384)  # Standard size for small embedding models
 
 def get_chat_model():
     """
app/core/memory.py CHANGED
@@ -8,6 +8,8 @@ from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams
+from langchain.chains.base import Chain
+from typing import Dict, List, Any
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -18,6 +20,34 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(
 from app.config import VECTOR_DB_PATH, COLLECTION_NAME
 from app.core.llm import get_llm, get_embeddings, get_chat_model
 
+class CustomRAGChain:
+    """Custom RAG chain that always returns standardized output format."""
+
+    def __init__(self, base_chain):
+        self.base_chain = base_chain
+        logger.info("CustomRAGChain initialized")
+
+    def __call__(self, inputs):
+        """Process inputs and return standardized output."""
+        try:
+            logger.info("CustomRAGChain processing query")
+            # Execute the underlying chain
+            result = self.base_chain(inputs)
+            logger.info(f"Base chain returned keys: {list(result.keys())}")
+
+            # Create standardized output
+            standardized = {
+                "answer": result.get("answer", "I couldn't generate an answer."),
+                "sources": result.get("source_documents", [])
+            }
+            return standardized
+        except Exception as e:
+            logger.error(f"Error in CustomRAGChain: {e}")
+            return {
+                "answer": f"Error processing query: {str(e)}",
+                "sources": []
+            }
+
 class MemoryManager:
     """Manages the RAG memory system using a vector database."""
 
@@ -117,38 +147,58 @@ class MemoryManager:
     def create_rag_chain(self):
         """Create a RAG chain for question answering."""
         try:
-            # Configure correct return keys to match what agent.py expects
-            logger.info("Creating ConversationalRetrievalChain")
-            chain = ConversationalRetrievalChain.from_llm(
-                llm=self.llm,
-                retriever=self.get_retriever(),
-                memory=self.memory,
-                return_source_documents=True,
-                return_generated_question=False,
-            )
+            # Create the base conversational retrieval chain
+            logger.info("Creating base ConversationalRetrievalChain")
 
-            # Create a wrapper function that normalizes the chain output format
-            def normalized_chain(inputs):
-                logger.info("Executing RAG chain with normalizer")
+            # Different approach: create a simple function instead
+            def simple_chain(query_dict):
                 try:
-                    # Execute the original chain
-                    response = chain(inputs)
-                    logger.info(f"Original chain output keys: {list(response.keys())}")
+                    # Extract the question
+                    question = query_dict.get("question", "")
+                    if not question.strip():
+                        return {
+                            "answer": "No question provided.",
+                            "sources": []
+                        }
+
+                    # Get relevant documents from the retriever
+                    retriever = self.get_retriever()
+                    relevant_docs = retriever.get_relevant_documents(question)
 
-                    # Create a normalized response
-                    normalized = {
-                        "answer": response.get("answer", "No answer generated"),
-                        "sources": response.get("source_documents", [])
+                    # Format the context from relevant documents
+                    context = "\n\n".join([doc.page_content for doc in relevant_docs])
+
+                    # Get chat history from memory
+                    chat_history = self.memory.chat_memory.messages
+                    chat_history_str = "\n".join([f"{msg.type}: {msg.content}" for msg in chat_history])
+
+                    # Create the prompt
+                    prompt = f"""You are a helpful AI assistant. Answer the following question based on the provided context.
+
+Context:
+{context}
+
+Chat History:
+{chat_history_str}
+
+Question: {question}
+Answer:"""
+
+                    # Get the answer from the LLM
+                    answer = self.llm(prompt)
+
+                    return {
+                        "answer": answer,
+                        "sources": relevant_docs
                     }
-                    return normalized
                 except Exception as e:
-                    logger.error(f"Error in normalized chain: {e}")
+                    logger.error(f"Error in simple_chain: {e}")
                     return {
-                        "answer": f"Error processing your query: {str(e)}",
+                        "answer": f"I encountered an error: {str(e)}",
                         "sources": []
                     }
 
-            return normalized_chain
+            return simple_chain
         except Exception as e:
             logger.error(f"Error creating RAG chain: {e}")
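
Note that the new `CustomRAGChain` wrapper is defined but `create_rag_chain` ultimately returns the plain `simple_chain` closure, so callers receive a callable that takes and returns dicts. A hypothetical invocation (construction details of `MemoryManager` are assumed from context):

from app.core.memory import MemoryManager

manager = MemoryManager()           # assumed default construction
chain = manager.create_rag_chain()  # returns the simple_chain closure

result = chain({"question": "What does the uploaded document cover?"})
print(result["answer"])             # LLM answer, or an error string
for doc in result["sources"]:       # retrieved Document objects
    print(doc.page_content[:80])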
app/ui/streamlit_app.py CHANGED
@@ -120,9 +120,21 @@ with st.sidebar:
                 logger.warning("Using temporary file path instead of documents directory")
                 doc_path = temp_path
 
-            # Ingest the document
+            # Ingest the document with retry logic for 403 errors
             logger.info("Ingesting document")
-            document_processor.ingest_file(temp_path, {"original_name": uploaded_file.name})
+            max_retries = 3
+
+            for attempt in range(max_retries):
+                try:
+                    document_processor.ingest_file(temp_path, {"original_name": uploaded_file.name})
+                    break
+                except Exception as e:
+                    error_str = str(e).lower()
+                    if ("403" in error_str or "forbidden" in error_str or "permission" in error_str) and attempt < max_retries - 1:
+                        logger.warning(f"Permission error ({attempt+1}/{max_retries}), retrying...")
+                        time.sleep(1.5)  # Add delay between retries
+                    else:
+                        raise  # Re-raise if not a 403 error or on last attempt
 
             # Clean up the temporary file if different from doc_path
             if temp_path != doc_path and os.path.exists(temp_path):
@@ -136,6 +148,9 @@ with st.sidebar:
         except Exception as e:
             logger.error(f"Error processing document: {str(e)}")
             st.error(f"Error processing document: {str(e)}")
+
+            if "403" in str(e) or "forbidden" in str(e).lower():
+                st.warning("This appears to be a permissions issue. Try using a different file format or using the text input option instead.")
     except Exception as e:
         logger.error(f"File uploader error: {str(e)}")
         st.error(f"File upload functionality is currently unavailable: {str(e)}")
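
One caveat: the retry loop calls `time.sleep`, and this hunk does not show a corresponding `import time` at the top of `streamlit_app.py`; if that import is missing, the handler itself would raise `NameError`. The same backoff pattern as a self-contained helper (names illustrative, not from the repo):

import time

def retry_on_permission_error(fn, max_retries=3, delay=1.5):
    """Call fn(), retrying when the error message looks like an HTTP 403."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:
            msg = str(e).lower()
            retryable = "403" in msg or "forbidden" in msg or "permission" in msg
            if retryable and attempt < max_retries - 1:
                time.sleep(delay)  # brief pause before the next attempt
            else:
                raise  # non-403 errors and the final attempt propagate

# e.g.: retry_on_permission_error(lambda: processor.ingest_file(path, meta))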
fix_403_error.py ADDED
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+"""
+Script to fix 403 errors and push changes to Hugging Face Spaces
+"""
+import os
+import subprocess
+import sys
+from getpass import getpass
+
+def fix_403_errors():
+    """Update the app to fix 403 errors and push to Hugging Face Space."""
+    print("=" * 50)
+    print("Fix 403 Errors and Push to Hugging Face")
+    print("=" * 50)
+
+    # Get credentials
+    username = input("Enter your Hugging Face username: ")
+    token = getpass("Enter your Hugging Face token: ")
+    space_name = input("Enter your Space name: ")
+
+    # Set environment variables
+    os.environ["HUGGINGFACEHUB_API_TOKEN"] = token
+    os.environ["HF_API_KEY"] = token
+
+    # Add the direct remote URL with credentials embedded
+    remote_url = f"https://{username}:{token}@huggingface.co/spaces/{username}/{space_name}"
+
+    try:
+        # Update git remotes
+        remotes = subprocess.run(["git", "remote"], capture_output=True, text=True).stdout.strip().split('\n')
+        if "hf" not in remotes:
+            subprocess.run(["git", "remote", "add", "hf", remote_url], check=True)
+        else:
+            subprocess.run(["git", "remote", "set-url", "hf", remote_url], check=True)
+
+        # Pull the latest changes first to avoid conflicts
+        try:
+            subprocess.run(["git", "pull", "hf", "main"], check=True)
+            print("Successfully pulled latest changes")
+        except subprocess.CalledProcessError:
+            print("Warning: Could not pull latest changes. Will attempt to push anyway.")
+
+        # Stage all files
+        subprocess.run(["git", "add", "."], check=True)
+
+        # Commit changes
+        try:
+            subprocess.run(["git", "commit", "-m", "Fix 403 errors by improving model loading and error handling"], check=True)
+            print("Changes committed successfully")
+        except subprocess.CalledProcessError:
+            # Check if there are changes to commit
+            status = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True).stdout.strip()
+            if not status:
+                print("No changes to commit.")
+            else:
+                print("Error making commit. Will try to push existing commits.")
+
+        # Push to Space
+        print("Pushing to Hugging Face Space...")
+
+        # First try a normal push
+        try:
+            subprocess.run(["git", "push", "hf", "HEAD:main"], check=True)
+        except subprocess.CalledProcessError:
+            print("Normal push failed. Trying force push instead...")
+            try:
+                # Force push if normal push fails
+                subprocess.run(["git", "push", "-f", "hf", "HEAD:main"], check=True)
+            except subprocess.CalledProcessError as e:
+                print(f"Force push also failed: {e}")
+                print("Trying alternative push approach...")
+
+                # Most reliable way to push to HF Spaces
+                api_url = f"https://huggingface.co/spaces/{username}/{space_name}"
+
+                try:
+                    subprocess.run(["git", "remote", "set-url", "hf", api_url], check=True)
+                    subprocess.run(["git", "push", "-f", "--set-upstream", "hf", "HEAD:main"], check=True)
+                except subprocess.CalledProcessError as e:
+                    print(f"All push attempts failed. Final error: {e}")
+                    return False
+
+        print("\nSuccess! Your fixes have been pushed to Hugging Face Space.")
+        print(f"View your Space at: https://huggingface.co/spaces/{username}/{space_name}")
+        print("Note: It may take a few minutes for changes to appear.")
+        return True
+
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        return False
+
+if __name__ == "__main__":
+    if fix_403_errors():
+        print("403 error fixes successfully deployed!")
+    else:
+        print("Failed to deploy 403 error fixes. Please check the error messages above.")
+        sys.exit(1)
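
For clarity, the remote URL the script constructs embeds the credentials directly in the git remote; with placeholder values (not a real token) it looks like this:

# Illustration with placeholder values - never hard-code a real token
username, token, space_name = "alice", "hf_xxxx", "my-space"
remote_url = f"https://{username}:{token}@huggingface.co/spaces/{username}/{space_name}"
print(remote_url)
# -> https://alice:hf_xxxx@huggingface.co/spaces/alice/my-space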