thechaiexperiment committed
Commit d862e2d · verified · 1 Parent(s): fee17f1

Update app.py
Files changed (1):
  app.py  +178 −65
app.py CHANGED
@@ -167,9 +167,23 @@ def normalize_key(key: str) -> str:
     return key
 
 
-def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
+import os
+import numpy as np
+from typing import Optional
+from safetensors.numpy import load_file
+from huggingface_hub import hf_hub_download
+
+def load_recipes_embeddings() -> Optional[np.ndarray]:
+    """
+    Loads recipe embeddings from a .safetensors file, handling local and remote downloads.
+
+    Returns:
+        Optional[np.ndarray]: A numpy array containing all embeddings (shape: (num_recipes, embedding_dim)).
+    """
     try:
         embeddings_path = 'recipes_embeddings.safetensors'
+
+        # Check if file exists locally, otherwise download from Hugging Face Hub
         if not os.path.exists(embeddings_path):
             print("File not found locally. Attempting to download from Hugging Face Hub...")
             embeddings_path = hf_hub_download(
@@ -178,30 +192,30 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
             repo_type="space"
         )
 
-        # Using safe_open from safetensors to load embeddings
-        embeddings = {}
-        from safetensors.numpy import safe_open
-        with safe_open(embeddings_path, framework="pt") as f:
-            keys = list(f.keys())
-            for key in keys:
-                try:
-                    normalized_key = normalize_key(key)
-                    tensor = f.get_tensor(key)
-                    embeddings[normalized_key] = tensor.numpy()
-                except Exception as key_error:
-                    print(f"Failed to process key {key}: {key_error}")
+        # Load the embeddings tensor from the .safetensors file
+        embeddings = load_file(embeddings_path)
 
-        if embeddings:
-            print(f"Successfully loaded {len(embeddings)} embeddings.")
-        else:
-            print("No embeddings could be loaded. Please check the file format and content.")
+        # Ensure the key 'embeddings' exists in the file
+        if "embeddings" not in embeddings:
+            raise ValueError("Key 'embeddings' not found in the safetensors file.")
+
+        # Retrieve the tensor as a numpy array
+        tensor = embeddings["embeddings"]
 
-        return embeddings
+        # Print information about the embeddings
+        print("Successfully loaded embeddings.")
+        print(f"Shape of embeddings: {tensor.shape}")
+        print(f"Dtype of embeddings: {tensor.dtype}")
+        print(f"First few values of the first embedding: {tensor[0][:5]}")
+
+        return tensor
 
     except Exception as e:
         print(f"Error loading embeddings: {e}")
         return None
 
+
+
 def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
     """Load document data from HTML articles in a specified folder."""
     try:
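Note: the new loader assumes the .safetensors file stores the whole matrix under a single "embeddings" key. A minimal sketch of producing a compatible file with safetensors (the matrix shape, dtype, and row count here are illustrative, not taken from this repo):

    import numpy as np
    from safetensors.numpy import save_file

    # Hypothetical embedding matrix: one row per recipe (e.g. 384-dim vectors)
    matrix = np.random.rand(1000, 384).astype(np.float32)

    # The key must be "embeddings" to pass the check in load_recipes_embeddings()
    save_file({"embeddings": matrix}, "recipes_embeddings.safetensors")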
@@ -295,19 +309,42 @@ def query_embeddings(query_embedding, embeddings_data=None, n_results=5):
         print(f"Error in query_embeddings: {e}")
         return []
 
-def query_recipes_embeddings(query_embedding, embeddings_data=None, n_results=5):
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+def query_recipes_embeddings(query_embedding: np.ndarray, n_results: int = 5):
+    """
+    Query the recipes embeddings to find the most similar recipes based on cosine similarity.
+
+    Args:
+        query_embedding (np.ndarray): A 1D numpy array representing the query embedding.
+        n_results (int): Number of top results to return.
+
+    Returns:
+        List[Tuple[int, float]]: A list of tuples containing the indices of the top results and their similarity scores.
+    """
+    # Load embeddings
     embeddings_data = load_recipes_embeddings()
-    if not embeddings_data:
+    if embeddings_data is None:
         print("No embeddings data available.")
         return []
+
     try:
-        doc_ids = list(embeddings_data.keys())
-        doc_embeddings = np.array(list(embeddings_data.values()))
-        similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
+        # Ensure query_embedding is 2D for cosine similarity computation
+        if query_embedding.ndim == 1:
+            query_embedding = query_embedding.reshape(1, -1)
+
+        # Compute cosine similarity
+        similarities = cosine_similarity(query_embedding, embeddings_data).flatten()
+
+        # Get the indices of the top N most similar embeddings
         top_indices = similarities.argsort()[-n_results:][::-1]
-        return [(doc_ids[i], similarities[i]) for i in top_indices]
+
+        # Return the indices and similarity scores of the top results
+        return [(index, similarities[index]) for index in top_indices]
+
     except Exception as e:
-        print(f"Error in query_embeddings: {e}")
+        print(f"Error in query_recipes_embeddings: {e}")
        return []
 
 def get_page_title(url):
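Note: with the dict-of-embeddings format gone, query_recipes_embeddings now returns (row_index, score) pairs instead of (doc_id, score). A quick usage sketch, assuming the same illustrative 384-dim embedding space as above:

    import numpy as np

    # Hypothetical 1D query embedding; the function reshapes it to 2D internally
    query_embedding = np.random.rand(384).astype(np.float32)

    for index, score in query_recipes_embeddings(query_embedding, n_results=5):
        print(f"recipe row {index}: cosine similarity {score:.3f}")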
@@ -342,31 +379,68 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded_articles'):
             texts.append("")
     return texts
 
-def retrieve_rec_texts(document_ids, folder_path='downloaded_articles/downloaded_articles', metadata_path = 'recipes_metadata.xlsx'):
-    # Load metadata file to map document IDs to original file names
-    metadata_path = 'recipes_metadata.xlsx'
-    metadata_df = pd.read_excel(metadata_path)
-    # Ensure column names are as expected
-    if "id" not in metadata_df.columns or "original_file_name" not in metadata_df.columns:
-        raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")
-    # Create a mapping of ID to original file name
-    id_to_file_name = dict(zip(metadata_df["id"].astype(str), metadata_df["original_file_name"]))
-    document_texts = []
-    for doc_id in document_ids:
-        # Get the original file name for the given document ID
-        original_file_name = id_to_file_name.get(doc_id)
-        if not original_file_name:
-            print(f"Warning: No original file name found for document ID {doc_id}")
-            continue
-        # Construct the file path using the original file name
-        file_path = os.path.join(folder_path, original_file_name)
-        # Check if the file exists and read its content
-        if os.path.exists(file_path):
-            with open(file_path, "r", encoding="utf-8") as f:
-                document_texts.append(f.read())
-        else:
-            print(f"Warning: File not found for {file_path}")
-    return document_texts
+import os
+import pandas as pd
+
+def retrieve_rec_texts(
+    document_indices,
+    folder_path='downloaded_articles/downloaded_articles',
+    metadata_path='recipes_metadata.xlsx'
+):
+    """
+    Retrieve the texts of documents corresponding to the given indices.
+
+    Args:
+        document_indices (List[int]): A list of document indices to retrieve.
+        folder_path (str): Path to the folder containing the article files.
+        metadata_path (str): Path to the metadata file mapping indices to file names.
+
+    Returns:
+        List[str]: A list of document texts corresponding to the given indices.
+    """
+    try:
+        # Load metadata file to map indices to original file names
+        metadata_df = pd.read_excel(metadata_path)
+
+        # Ensure the metadata file has the required columns
+        if "id" not in metadata_df.columns or "original_file_name" not in metadata_df.columns:
+            raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")
+
+        # Ensure the 'id' column aligns with the embeddings row indices
+        metadata_df = metadata_df.sort_values(by="id").reset_index(drop=True)
+
+        # Verify the alignment of metadata with embeddings indices
+        if metadata_df.index.max() < max(document_indices):
+            raise ValueError("Some document indices exceed the range of metadata.")
+
+        # Retrieve file names for the given indices
+        document_texts = []
+        for idx in document_indices:
+            if idx >= len(metadata_df):
+                print(f"Warning: Index {idx} is out of range for metadata.")
+                continue
+
+            original_file_name = metadata_df.iloc[idx]["original_file_name"]
+            if not original_file_name:
+                print(f"Warning: No file name found for index {idx}")
+                continue
+
+            # Construct the file path using the original file name
+            file_path = os.path.join(folder_path, original_file_name)
+
+            # Check if the file exists and read its content
+            if os.path.exists(file_path):
+                with open(file_path, "r", encoding="utf-8") as f:
+                    document_texts.append(f.read())
+            else:
+                print(f"Warning: File not found at {file_path}")
+
+        return document_texts
+
+    except Exception as e:
+        print(f"Error in retrieve_rec_texts: {e}")
+        return []
+
 
 
 
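Note: retrieve_rec_texts now treats document IDs as positional row indices into the metadata sorted by "id", so recipes_metadata.xlsx must enumerate ids in the same order as the embedding rows. An illustrative sketch of a compatible metadata file (the file names are made up):

    import pandas as pd

    metadata = pd.DataFrame({
        "id": [0, 1, 2],  # must sort into the same order as the embedding rows
        "original_file_name": ["recipe_0.html", "recipe_1.html", "recipe_2.html"],
    })
    metadata.to_excel("recipes_metadata.xlsx", index=False)  # requires openpyxl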
@@ -773,6 +847,19 @@ async def resources_endpoint(profile: MedicalProfile):
 
 
 
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import pandas as pd
+import numpy as np
+from typing import Optional, List
+
+app = FastAPI()
+
+# Define your profile model for input data
+class MedicalProfile(BaseModel):
+    conditions: str
+    daily_symptoms: str
+
 @app.post("/api/recipes")
 async def recipes_endpoint(profile: MedicalProfile):
     try:
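Note: requests to /api/recipes are validated against MedicalProfile before the handler runs, so both string fields are required. A small sketch of what pydantic accepts and rejects (the field values are illustrative; .dict() is the pydantic v1 spelling):

    from pydantic import ValidationError

    profile = MedicalProfile(conditions="type 2 diabetes", daily_symptoms="fatigue after meals")
    print(profile.dict())  # {'conditions': ..., 'daily_symptoms': ...}

    try:
        MedicalProfile(conditions="asthma")  # missing daily_symptoms
    except ValidationError as e:
        print(e)  # pydantic reports the missing field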
@@ -795,55 +882,80 @@ async def recipes_endpoint(profile: MedicalProfile):
-        initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=10)
+        initial_results = query_recipes_embeddings(query_embedding, n_results=10)
         if not initial_results:
             raise ValueError("No relevant recipes found.")
+        print("Initial results (document indices and similarities):")
         print(initial_results)
-        # Extract document IDs
-        document_ids = [doc_id for doc_id, _ in initial_results]
-        print(document_ids)
-        # Retrieve document texts
-        document_texts = retrieve_rec_texts(document_ids, folder_path)
+
+        # Extract document indices from the results
+        document_indices = [doc_id for doc_id, _ in initial_results]
+        print("Document indices:", document_indices)
+
+        # Retrieve document texts using the indices
+        document_texts = retrieve_rec_texts(document_indices, folder_path)
         if not document_texts:
             raise ValueError("Failed to retrieve document texts.")
+        print("Document texts retrieved:")
         print(document_texts)
-        # Load recipe metadata from DataFrame
-        folder_path='downloaded_articles/downloaded_articles'
-        file_path = 'recipes_metadata.xlsx'
-        metadata_path = 'recipes_metadata.xlsx'
-        metadata_df = pd.read_excel(file_path)
+
+        # Extract relevant portions from documents using the query text
         relevant_portions = extract_relevant_portions(document_texts, query_text, max_portions=3, portion_size=1, min_query_words=1)
+        print("Relevant portions extracted:")
         print(relevant_portions)
+
         flattened_relevant_portions = []
         for doc_id, portions in relevant_portions.items():
             flattened_relevant_portions.extend(portions)
         unique_selected_parts = remove_duplicates(flattened_relevant_portions)
+        print("Unique selected parts:")
         print(unique_selected_parts)
+
         combined_parts = " ".join(unique_selected_parts)
+        print("Combined text for context:")
         print(combined_parts)
+
         context = [query_text] + unique_selected_parts
+        print("Final context for answering:")
         print(context)
+
+        # Extract entities from the query
         entities = extract_entities(query_text)
+        print("Extracted entities:")
         print(entities)
+
+        # Enhance the passage with the extracted entities
         passage = enhance_passage_with_entities(combined_parts, entities)
+        print("Enhanced passage with entities:")
         print(passage)
+
+        # Create the prompt for the model
         prompt = create_prompt(query_text, passage)
+        print("Generated prompt:")
         print(prompt)
+
+        # Generate the answer from the model
         answer = generate_answer(prompt)
+        print("Generated answer:")
         print(answer)
+
+        # Clean up the answer to extract the relevant part
         answer_part = answer.split("Answer:")[-1].strip()
-        print(answer_part)
         cleaned_answer = remove_answer_prefix(answer_part)
+        print("Cleaned answer:")
         print(cleaned_answer)
+
         final_answer = remove_incomplete_sentence(cleaned_answer)
-        print(final_answer )
+        print("Final answer:")
+        print(final_answer)
+
         if language_code == 0:
             final_answer = translate_en_to_ar(final_answer)
+
         if final_answer:
             print("Answer:")
             print(final_answer)
         else:
             print("Sorry, I can't help with that.")
-        return {
-            "response": final_answer,
-        }
+
+        return {"response": final_answer}
 
     except ValueError as ve:
         # Handle expected errors
@@ -853,6 +965,7 @@ async def recipes_endpoint(profile: MedicalProfile):
         print(f"Unexpected error: {e}")
         raise HTTPException(status_code=500, detail="An unexpected error occurred.")
 
+
 if not init_success:
     print("Warning: Application initialized with partial functionality")
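Note: a minimal smoke test for the updated endpoint using FastAPI's TestClient. This is a sketch; the module import (from app import app) and the field values are assumptions:

    from fastapi.testclient import TestClient
    from app import app

    client = TestClient(app)

    def test_recipes_endpoint():
        payload = {"conditions": "hypertension", "daily_symptoms": "headache"}
        response = client.post("/api/recipes", json=payload)
        assert response.status_code == 200
        assert "response" in response.json()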