Spaces:

thechaiexperiment
/

TeaRAG

Sleeping

App Files Files Community

thechaiexperiment committed on Jan 8

Commit

5635397

1 Parent(s): f342c38

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -23

app.py CHANGED Viewed

@@ -86,15 +86,18 @@ import pickle
 import numpy as np
 import os
 from typing import Dict, Optional
-class EmbeddingsUnpickler(pickle.Unpickler):
     def persistent_load(self, pid):
-        # Handle persistent IDs by returning them as-is
-        return pid
 def load_embeddings(embeddings_path: str = 'embeddings.pkl') -> Optional[Dict[str, np.ndarray]]:
     """
-    Load embeddings from a pickle file with support for persistent IDs.
     Args:
         embeddings_path (str): Path to the pickle file containing embeddings
@@ -107,37 +110,53 @@ def load_embeddings(embeddings_path: str = 'embeddings.pkl') -> Optional[Dict[st
         return None
     try:
-        with open(embeddings_path, 'rb') as f:
-            # Use custom unpickler with persistent_load support
-            unpickler = EmbeddingsUnpickler(f)
-            embeddings = unpickler.load()
         # Validate the loaded data
         if not isinstance(embeddings, dict):
             print(f"Error: Expected dict, got {type(embeddings)}")
             return None
-        # Convert values to numpy arrays if they aren't already
         processed_embeddings = {}
         for key, value in embeddings.items():
-            # Handle both direct arrays and persistent IDs
-            if isinstance(value, (list, np.ndarray)):
-                processed_embeddings[key] = np.array(value)
-            else:
-                # If it's a persistent ID, convert it to a numpy array
-                try:
-                    processed_embeddings[key] = np.array(value)
-                except Exception as e:
-                    print(f"Warning: Could not convert embedding for {key}: {e}")
-                    continue
-        # Print sample for verification
         if processed_embeddings:
             sample_key = next(iter(processed_embeddings))
             print(f"Data type: {type(processed_embeddings)}")
-            print(f"First few keys and values:")
-            print(f"Key: {sample_key}, Value: {processed_embeddings[sample_key][:20]}")
-            print(f"Successfully loaded {len(processed_embeddings)} embeddings")
             return processed_embeddings
         else:
             print("Error: No valid embeddings were processed")
@@ -145,8 +164,10 @@ def load_embeddings(embeddings_path: str = 'embeddings.pkl') -> Optional[Dict[st
     except Exception as e:
         print(f"Error loading embeddings: {str(e)}")
         return None
 def load_documents_data():
     """Load document data with error handling"""
     try:

 import numpy as np
 import os
 from typing import Dict, Optional
+import codecs
+class LFSEmbeddingsUnpickler(pickle.Unpickler):
     def persistent_load(self, pid):
+        # Ensure persistent ID is ASCII string
+        if isinstance(pid, bytes):
+            return pid.decode('ascii')
+        return str(pid)
 def load_embeddings(embeddings_path: str = 'embeddings.pkl') -> Optional[Dict[str, np.ndarray]]:
     """
+    Load embeddings from a pickle file with support for Git LFS and protocol 0 requirements.
     Args:
         embeddings_path (str): Path to the pickle file containing embeddings
         return None
     try:
+        # Open file in binary mode with buffering
+        with open(embeddings_path, 'rb', buffering=1024*1024) as f:
+            # Check if it's a Git LFS pointer file
+            first_line = f.peek(100)[:100].decode('utf-8', errors='ignore')
+            if 'version https://git-lfs.github.com/spec/' in first_line:
+                print("Warning: This appears to be a Git LFS pointer file.")
+                print("Please ensure you've properly downloaded the actual embeddings file using Git LFS")
+                return None
+            # Use custom unpickler with ASCII string handling
+            unpickler = LFSEmbeddingsUnpickler(f)
+            # Set encoding for protocol 0 compatibility
+            if hasattr(unpickler, 'encoding'):
+                unpickler.encoding = 'ascii'
+            try:
+                embeddings = unpickler.load()
+            except UnicodeDecodeError:
+                # If ASCII decode fails, try UTF-8
+                f.seek(0)
+                unpickler = pickle.Unpickler(f)
+                embeddings = unpickler.load()
         # Validate the loaded data
         if not isinstance(embeddings, dict):
             print(f"Error: Expected dict, got {type(embeddings)}")
             return None
+        # Convert values to numpy arrays
         processed_embeddings = {}
         for key, value in embeddings.items():
+            try:
+                # Handle various input types
+                if isinstance(value, np.ndarray):
+                    processed_embeddings[key] = value
+                else:
+                    processed_embeddings[key] = np.array(value, dtype=np.float32)
+            except Exception as e:
+                print(f"Warning: Could not process embedding for {key}: {e}")
+                continue
         if processed_embeddings:
             sample_key = next(iter(processed_embeddings))
             print(f"Data type: {type(processed_embeddings)}")
+            print(f"Total embeddings loaded: {len(processed_embeddings)}")
+            print(f"Sample embedding shape: {processed_embeddings[sample_key].shape}")
             return processed_embeddings
         else:
             print("Error: No valid embeddings were processed")
     except Exception as e:
         print(f"Error loading embeddings: {str(e)}")
+        print("If using Git LFS, ensure you've run 'git lfs pull' to download the actual file")
         return None
 def load_documents_data():
     """Load document data with error handling"""
     try: