Update app.py

app.py CHANGED
@@ -1,6 +1,7 @@
 import transformers
 import pickle
 import os
+import re
 import numpy as np
 import torchvision
 import nltk
@@ -158,6 +159,14 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
         print(f"Error loading embeddings: {e}")
         return None
 
+def normalize_key(key: str) -> str:
+    """Normalize embedding keys to match metadata IDs."""
+    match = re.search(r'file_(\d+)', key)
+    if match:
+        return match.group(1)  # Extract the numeric part
+    return key
+
+
 def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
     try:
         embeddings_path = 'recipes_embeddings.safetensors'
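As a quick sanity check, the new helper reduces any key containing file_<n> to the bare number and leaves everything else untouched (the key names below are made up; the real ones come from recipes_embeddings.safetensors):

    import re

    def normalize_key(key: str) -> str:  # copied from the hunk above
        """Normalize embedding keys to match metadata IDs."""
        match = re.search(r'file_(\d+)', key)
        if match:
            return match.group(1)  # Extract the numeric part
        return key

    assert normalize_key("file_123") == "123"
    assert normalize_key("recipes/file_7.html") == "7"
    assert normalize_key("no_numeric_id") == "no_numeric_id"  # passes through unchanged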
@@ -176,8 +185,9 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
         keys = list(f.keys())
         for key in keys:
             try:
+                normalized_key = normalize_key(key)
                 tensor = f.get_tensor(key)
-                embeddings[key] = tensor.numpy()
+                embeddings[normalized_key] = tensor.numpy()
             except Exception as key_error:
                 print(f"Failed to process key {key}: {key_error}")
 
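For context, here is the loading pattern this hunk modifies, collapsed into a standalone sketch. The safe_open call is an assumption (the diff only shows the f.get_tensor loop) and presumes the file was saved from PyTorch tensors:

    from safetensors import safe_open

    embeddings = {}
    with safe_open("recipes_embeddings.safetensors", framework="pt") as f:
        for key in list(f.keys()):
            try:
                normalized_key = normalize_key(key)  # e.g. "file_123" -> "123"
                embeddings[normalized_key] = f.get_tensor(key).numpy()
            except Exception as key_error:
                print(f"Failed to process key {key}: {key_error}")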
@@ -291,8 +301,8 @@ def query_recipes_embeddings(query_embedding, embeddings_data=None, n_results=5)
         print("No embeddings data available.")
         return []
     try:
-        doc_ids = embeddings_data
-        doc_embeddings = embeddings_data
+        doc_ids = list(embeddings_data.keys())
+        doc_embeddings = np.array(list(embeddings_data.values()))
         similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
         top_indices = similarities.argsort()[-n_results:][::-1]
         return [(doc_ids[i], similarities[i]) for i in top_indices]
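The fix unpacks IDs and vectors from the single {doc_id: vector} dict, relying on keys() and values() iterating in the same insertion order (guaranteed since Python 3.7). A toy run of the same logic:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # Stand-in embeddings; the real vectors come from load_recipes_embeddings().
    embeddings_data = {"101": np.array([1.0, 0.0]), "102": np.array([0.0, 1.0])}
    query_embedding = np.array([[0.9, 0.1]])  # shape (1, dim), as cosine_similarity expects

    doc_ids = list(embeddings_data.keys())
    doc_embeddings = np.array(list(embeddings_data.values()))
    similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
    top_indices = similarities.argsort()[-2:][::-1]
    print([(doc_ids[i], round(float(similarities[i]), 3)) for i in top_indices])
    # [('101', 0.994), ('102', 0.11)] -- highest-similarity document first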
@@ -332,10 +342,11 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
         texts.append("")
     return texts
 
-def retrieve_rec_texts(
+def retrieve_rec_texts(document_ids, folder_path):
     document_texts = []
     for doc_id in document_ids:
-
+        file_name = f"file_{doc_id}.html"  # Map numeric ID back to the original file name
+        file_path = os.path.join(folder_path, file_name)
         if os.path.exists(file_path):
             with open(file_path, "r") as f:
                 document_texts.append(f.read())
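This is the inverse of normalize_key: the numeric ID coming back from the query step is mapped onto the on-disk name before reading (the folder name below is illustrative; the real default is truncated in the hunk header above):

    import os

    doc_id = "123"  # normalized ID returned by query_recipes_embeddings
    file_path = os.path.join("downloaded_articles", f"file_{doc_id}.html")
    print(file_path)  # downloaded_articles/file_123.html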
@@ -743,18 +754,18 @@ async def recipes_endpoint(profile: MedicalProfile):
     recipes = []
     for score, doc_id, text in scored_documents:
         # Retrieve metadata for the document
-        doc_info = metadata_df[metadata_df["
+        doc_info = metadata_df[metadata_df["id"] == int(doc_id)]  # Match numeric ID
         if not doc_info.empty:
-            title = doc_info.iloc[0]["
-
-
-
-
-
-            "score": score,
+            title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
+            recipes.append({
+                "id": doc_id,
+                "title": title,
+                "content_preview": text[:200],  # First 200 characters
+                "score": score,
             })
 
 
+
     # Limit the response to top 5 recipes
     return {"recipes": recipes[:5], "success": True}
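The endpoint lookup hinges on a type mismatch: the normalized doc_id is a string while metadata_df["id"] presumably holds integers, hence the int(doc_id) cast. A toy version of the lookup (the frame below is illustrative; the real rows come from the project's metadata file):

    import pandas as pd

    metadata_df = pd.DataFrame({
        "id": [123, 456],
        "original_file_name": ["file_123.html", "file_456.html"],
    })

    doc_id = "123"  # normalized key is a string
    doc_info = metadata_df[metadata_df["id"] == int(doc_id)]
    title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
    print(title)  # file_123.html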