thechaiexperiment committed on
Commit
3b1c99a
·
1 Parent(s): 554b5f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -49
app.py CHANGED
@@ -23,72 +23,93 @@ import time
23
  app = FastAPI()
24
 
25
  class ArticleEmbeddingUnpickler(pickle.Unpickler):
26
- """Custom unpickler specifically designed for article embeddings"""
27
- def find_class(self, module, name):
28
- # Handle numpy arrays specially
29
  if module == 'numpy':
30
  return getattr(np, name)
31
- # Handle the SentenceTransformer case
32
  if module == 'sentence_transformers.SentenceTransformer':
33
  from sentence_transformers import SentenceTransformer
34
  return SentenceTransformer
35
  return super().find_class(module, name)
36
 
37
- def persistent_load(self, pid):
38
- """Handle persistent IDs during unpickling"""
39
  try:
40
- # Convert to string if bytes
41
  if isinstance(pid, bytes):
42
- pid = pid.decode('utf-8', errors='ignore')
43
- return str(pid)
 
 
44
  except Exception as e:
45
- print(f"Error in persistent_load: {str(e)}")
46
- return str(pid)
47
 
48
- def safe_load_embeddings(file_path='embeddings.pkl'):
49
- """Load embeddings with enhanced error handling for article embeddings"""
50
  try:
51
  if not os.path.exists(file_path):
52
- print(f"Embeddings file not found at {file_path}")
53
- return None
54
 
55
  with open(file_path, 'rb') as file:
56
  unpickler = ArticleEmbeddingUnpickler(file)
57
  embeddings_data = unpickler.load()
58
 
59
- # Validate the dictionary structure
60
- if not isinstance(embeddings_data, dict):
61
- print(f"Invalid data structure: expected dict, got {type(embeddings_data)}")
62
- return None
63
-
64
- # Validate each embedding
65
- valid_embeddings = {}
66
- for key, value in embeddings_data.items():
67
- # Ensure key is string and value is numpy array
68
- try:
69
- key_str = str(key)
70
- if isinstance(value, list):
71
- value = np.array(value)
72
- if isinstance(value, np.ndarray):
73
- valid_embeddings[key_str] = value
74
- else:
75
- print(f"Skipping invalid embedding for key {key}: {type(value)}")
76
- except Exception as e:
77
- print(f"Error processing embedding {key}: {str(e)}")
78
  continue
79
 
80
- if not valid_embeddings:
81
- print("No valid embeddings found in file")
82
- return None
 
 
 
 
 
83
 
84
- print(f"Successfully loaded {len(valid_embeddings)} embeddings")
85
- return valid_embeddings
86
 
87
- except Exception as e:
88
- print(f"Error loading embeddings: {str(e)}")
89
- return None
90
 
 
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Models and data structures
94
  class GlobalModels:
@@ -129,16 +150,16 @@ class DocumentResponse(BaseModel):
129
 
130
  # Modified startup event handler
131
  @app.on_event("startup")
 
132
  async def load_models():
133
- """Initialize models with enhanced embeddings loading"""
134
  try:
135
- # Load embeddings first
136
  embeddings_data = safe_load_embeddings()
137
- if embeddings_data is None:
138
- raise HTTPException(
139
- status_code=500,
140
- detail="Failed to load embeddings data. Check logs for details."
141
- )
142
  # Load embedding models first
143
  global_models.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
144
 
 
23
  app = FastAPI()
24
 
25
class ArticleEmbeddingUnpickler(pickle.Unpickler):
    """Unpickler specialised for article-embedding payloads.

    Resolves ``numpy`` attributes and ``SentenceTransformer`` references
    explicitly, and coerces persistent IDs of any type to plain strings.

    NOTE(review): ``find_class`` hands back arbitrary ``numpy`` attributes
    during unpickling — only use this on trusted embedding files, never on
    untrusted input.
    """

    def find_class(self, module: str, name: str) -> Any:
        # numpy types are resolved straight off the imported module object.
        if module == 'numpy':
            return getattr(np, name)
        # Import SentenceTransformer lazily, only when the pickle asks for it.
        if module == 'sentence_transformers.SentenceTransformer':
            from sentence_transformers import SentenceTransformer
            return SentenceTransformer
        # Everything else follows the default resolution rules.
        return super().find_class(module, name)

    def persistent_load(self, pid: Any) -> str:
        """Return a string form of a persistent ID, whatever its type."""
        try:
            if isinstance(pid, bytes):
                # Undecodable bytes become U+FFFD rather than raising.
                decoded = pid.decode('utf-8', errors='replace')
                return decoded
            # Scalars stringify cleanly; anything else falls back to repr().
            return str(pid) if isinstance(pid, (str, int, float)) else repr(pid)
        except Exception as exc:
            print(f"Warning: Error in persistent_load: {str(exc)}")
            return repr(pid)
47
 
48
def safe_load_embeddings(file_path: str = 'embeddings.pkl') -> Dict[str, np.ndarray]:
    """Read the pickled embeddings file and return only well-formed entries.

    Keys are coerced to non-empty, stripped strings; values must end up as
    finite 1-D float32 numpy arrays.  Malformed entries are skipped with a
    printed warning.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the payload is not a dict, or nothing survives
            validation.  All failures are logged before re-raising.
    """
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Embeddings file not found at {file_path}")

        with open(file_path, 'rb') as handle:
            loaded = ArticleEmbeddingUnpickler(handle).load()

        if not isinstance(loaded, dict):
            raise ValueError(f"Invalid data structure: expected dict, got {type(loaded)}")

        kept: Dict[str, np.ndarray] = {}
        for raw_key, raw_value in loaded.items():
            try:
                # Keys must be non-empty once stringified and stripped.
                name = str(raw_key).strip()
                if not name:
                    continue

                # Coerce the value to a float32 ndarray; reject other types.
                if isinstance(raw_value, list):
                    vector = np.array(raw_value, dtype=np.float32)
                elif isinstance(raw_value, np.ndarray):
                    vector = raw_value.astype(np.float32)
                else:
                    print(f"Skipping invalid embedding type for key {name}: {type(raw_value)}")
                    continue

                # Embeddings must be flat vectors with finite entries.
                if vector.ndim != 1:
                    print(f"Skipping invalid embedding shape for key {name}: {vector.shape}")
                    continue

                if np.isnan(vector).any() or np.isinf(vector).any():
                    print(f"Skipping embedding with invalid values for key {name}")
                    continue

                kept[name] = vector
            except Exception as exc:
                # One bad entry must not abort the whole load.
                print(f"Error processing embedding for key {raw_key}: {str(exc)}")
                continue

        if not kept:
            raise ValueError("No valid embeddings found in file")

        print(f"Successfully loaded {len(kept)} valid embeddings")
        return kept

    except Exception as exc:
        # Log at the boundary, then let the caller decide how to react.
        print(f"Error loading embeddings: {str(exc)}")
        raise
103
+
104
def safe_save_embeddings(embeddings_dict, file_path='embeddings.pkl'):
    """Persist *embeddings_dict* to *file_path* with ASCII-sanitised keys.

    Every key is stringified and any non-ASCII character is replaced with
    ``?`` so the file remains readable by protocol-0 (text-based) pickle
    loaders such as the custom unpickler used on the load path.
    """
    # Force each key into an ASCII-only string; undecodable chars become '?'.
    sanitised = {}
    for key, value in embeddings_dict.items():
        safe_key = str(key).encode('ascii', errors='replace').decode('ascii')
        sanitised[safe_key] = value

    # Protocol 0 keeps the pickle in its ASCII/text-compatible form.
    with open(file_path, 'wb') as out:
        pickle.dump(sanitised, out, protocol=0)
113
 
114
  # Models and data structures
115
  class GlobalModels:
 
150
 
151
  # Modified startup event handler
152
  @app.on_event("startup")
153
+ @app.on_event("startup")
154
  async def load_models():
 
155
  try:
156
+ print("Starting to load embeddings...")
157
  embeddings_data = safe_load_embeddings()
158
+ print(f"Embeddings data type: {type(embeddings_data)}")
159
+ if embeddings_data:
160
+ print(f"Number of embeddings: {len(embeddings_data)}")
161
+ # Print sample of keys
162
+ print("Sample keys:", list(embeddings_data.keys())[:3])
163
  # Load embedding models first
164
  global_models.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
165