Commit 66387cc · Parent: 8497042
Update app.py
app.py CHANGED

@@ -7,6 +7,9 @@ import nltk
 import torch
 import pandas as pd
 import requests
+import zipfile
+import tempfile
+from PyPDF2 import PdfReader
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -130,7 +133,7 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
         # Open the safetensors file
         with safe_open(embeddings_path, framework="pt") as f:
             keys = f.keys()
-
+            #print(f"Available keys in the .safetensors file: {list(keys)}")  # Debugging info
 
             # Iterate over the keys and load tensors
             for key in keys:
@@ -155,6 +158,46 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
         print(f"Error loading embeddings: {e}")
         return None
 
+def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
+    try:
+        # Locate or download the embeddings file
+        embeddings_path = 'recipes_embeddings.safetensors'
+        if not os.path.exists(embeddings_path):
+            print("File not found locally. Attempting to download from Hugging Face Hub...")
+            embeddings_path = hf_hub_download(
+                repo_id=os.environ.get('HF_SPACE_ID', 'thechaiexperiment/TeaRAG'),
+                filename="embeddings.safetensors",
+                repo_type="space"
+            )
+        # Initialize a dictionary to store embeddings
+        embeddings = {}
+        # Open the safetensors file
+        with safe_open(embeddings_path, framework="pt") as f:
+            keys = list(f.keys())
+            #print(f"Available keys in the .safetensors file: {keys}")  # Debugging info
+
+            # Iterate over the keys and load tensors
+            for key in keys:
+                try:
+                    tensor = f.get_tensor(key)  # Get the tensor associated with the key
+                    if tensor.shape[0] != 384:  # Optional: Validate tensor shape
+                        print(f"Warning: Tensor for key {key} has unexpected shape {tensor.shape}")
+
+                    # Convert tensor to NumPy array
+                    embeddings[key] = tensor.numpy()
+                except Exception as key_error:
+                    print(f"Failed to process key {key}: {key_error}")
+
+        if embeddings:
+            print(f"Successfully loaded {len(embeddings)} embeddings.")
+        else:
+            print("No embeddings could be loaded. Please check the file format and content.")
+
+        return embeddings
+
+    except Exception as e:
+        print(f"Error loading embeddings: {e}")
+        return None
 
 def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
     """Load document data from HTML articles in a specified folder."""
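
Note: the new load_recipes_embeddings() boils down to the standard safetensors access pattern (open the file, list keys, fetch tensors one by one). A minimal self-contained sketch of that pattern, with an illustrative file name and key rather than the Space's real data:

import numpy as np
from safetensors import safe_open
from safetensors.numpy import save_file

# Write a tiny illustrative file so the read path below actually runs.
save_file({"recipe_001": np.random.rand(384).astype(np.float32)}, "example.safetensors")

embeddings = {}
with safe_open("example.safetensors", framework="np") as f:  # "np" avoids the torch dependency
    for key in f.keys():
        embeddings[key] = f.get_tensor(key)  # already a NumPy array

print(embeddings["recipe_001"].shape)  # (384,)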

@@ -195,16 +238,87 @@ def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
             data['df'] = pd.DataFrame()
             return False
 
+def load_recipes_data(folder_path='pdf kb.zip'):
+    try:
+        print("Loading documents data...")
+        temp_dir = None
+
+        # Handle .zip file
+        if folder_path.endswith('.zip'):
+            if not os.path.exists(folder_path):
+                print(f"Error: .zip file '{folder_path}' not found.")
+                return False
+
+            # Create a temporary directory for extracting the .zip
+            temp_dir = tempfile.TemporaryDirectory()
+            extract_path = temp_dir.name
+
+            # Extract the .zip file
+            try:
+                with zipfile.ZipFile(folder_path, 'r') as zip_ref:
+                    zip_ref.extractall(extract_path)
+                print(f"Extracted .zip file to temporary folder: {extract_path}")
+            except Exception as e:
+                print(f"Error extracting .zip file: {e}")
+                return False
+
+            # Update the folder_path to the extracted directory
+            folder_path = extract_path
+
+        # Check if the folder exists
+        if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
+            print(f"Error: Folder '{folder_path}' not found.")
+            return False
+
+        # List all HTML or PDF files in the folder
+        html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]
+        pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
+
+        if not html_files and not pdf_files:
+            print(f"No HTML or PDF files found in folder '{folder_path}'.")
+            return False
+
+        documents = []
+
+        # Process PDF files (requires a PDF parser like PyPDF2)
+        for file_name in pdf_files:
+            file_path = os.path.join(folder_path, file_name)
+            try:
+                from PyPDF2 import PdfReader  # Import here to avoid dependency issues
+                reader = PdfReader(file_path)
+                text = "\n".join(page.extract_text() for page in reader.pages if page.extract_text())
+                documents.append({"file_name": file_name, "content": text})
+            except Exception as e:
+                print(f"Error reading PDF file {file_name}: {e}")
+
+        # Convert the list of documents to a DataFrame
+        data['df'] = pd.DataFrame(documents)
+
+        if data['df'].empty:
+            print("No valid documents loaded.")
+            return False
+
+        print(f"Successfully loaded {len(data['df'])} document records.")
+        return True
+    except Exception as e:
+        print(f"Error loading documents data: {e}")
+        data['df'] = pd.DataFrame()
+        return False
+    finally:
+        # Clean up the temporary directory, if created
+        if temp_dir:
+            temp_dir.cleanup()
+
 def load_data():
     """Load all required data"""
     embeddings_success = load_embeddings()
     documents_success = load_documents_data()
-    if not embeddings_success:
-
+    recipes_success = load_recipes_data()
+    recipes_embeddings_success = load_recipes_embeddings()
+    if not recipes_embeddings_success:
         print("Warning: Failed to load embeddings, falling back to basic functionality")
-    if not documents_success:
+    if not recipes_success:
         print("Warning: Failed to load documents data, falling back to basic functionality")
-
     return True
 
 # Initialize application
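
The ingestion path above (unzip to a temp dir, parse each PDF, build a DataFrame) can be condensed; a hypothetical helper sketching the same flow, with "recipes.zip" as a stand-in archive name:

import os
import tempfile
import zipfile

import pandas as pd
from PyPDF2 import PdfReader

def pdfs_to_dataframe(zip_path: str) -> pd.DataFrame:
    """Extract a zip of PDFs and return one row of text per file."""
    with tempfile.TemporaryDirectory() as extract_path:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
        rows = []
        for name in sorted(os.listdir(extract_path)):
            if not name.endswith(".pdf"):
                continue
            reader = PdfReader(os.path.join(extract_path, name))
            text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())
            rows.append({"file_name": name, "content": text})
    return pd.DataFrame(rows)

# df = pdfs_to_dataframe("recipes.zip")  # 'recipes.zip' is illustrative

Using TemporaryDirectory as a context manager makes the explicit finally/cleanup() pairing in the committed version unnecessary.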

@@ -248,6 +362,21 @@ def query_embeddings(query_embedding, embeddings_data=None, n_results=5):
         print(f"Error in query_embeddings: {e}")
         return []
 
+def query_recipes_embeddings(query_embedding, embeddings_data=None, n_results=5):
+    embeddings_data = load_recipes_embeddings()
+    if not embeddings_data:
+        print("No embeddings data available.")
+        return []
+    try:
+        doc_ids = list(embeddings_data.keys())
+        doc_embeddings = np.array(list(embeddings_data.values()))
+        similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
+        top_indices = similarities.argsort()[-n_results:][::-1]
+        return [(doc_ids[i], similarities[i]) for i in top_indices]
+    except Exception as e:
+        print(f"Error in query_embeddings: {e}")
+        return []
+
 def get_page_title(url):
     try:
         response = requests.get(url)
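
query_recipes_embeddings() ranks documents by cosine similarity and keeps the top n. The same computation on synthetic data, assuming 384-dimensional vectors and a 2-D query of shape (1, 384) as scikit-learn expects:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

embeddings_data = {f"doc_{i}": np.random.rand(384) for i in range(20)}  # synthetic corpus
query_embedding = np.random.rand(1, 384)

doc_ids = list(embeddings_data.keys())
doc_embeddings = np.array(list(embeddings_data.values()))            # shape (20, 384)
similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
top_indices = similarities.argsort()[-5:][::-1]                      # five best, descending
print([(doc_ids[i], round(float(similarities[i]), 3)) for i in top_indices])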

@@ -280,6 +409,48 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded_articles'):
             texts.append("")
     return texts
 
+def retrieve_recipes_texts(doc_ids, zip_path='pdf kb.zip'):
+    texts = []
+
+    try:
+        # Check if the .zip file exists
+        if not os.path.exists(zip_path):
+            print(f"Error: Zip file not found at '{zip_path}'")
+            return ["" for _ in doc_ids]
+
+        # Create a temporary directory to extract the .zip contents
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)  # Extract all files to the temp directory
+
+            # Iterate through the document IDs
+            for doc_id in doc_ids:
+                # Construct the expected PDF file path
+                pdf_path = os.path.join(temp_dir, f"{doc_id}.pdf")
+                try:
+                    # Check if the PDF file exists
+                    if not os.path.exists(pdf_path):
+                        print(f"Warning: PDF file not found: {pdf_path}")
+                        texts.append("")
+                        continue
+
+                    # Read and extract text from the PDF
+                    with open(pdf_path, 'rb') as pdf_file:
+                        reader = PdfReader(pdf_file)
+                        pdf_text = ""
+                        for page in reader.pages:
+                            pdf_text += page.extract_text()
+
+                    # Add the extracted text to the result list
+                    texts.append(pdf_text.strip())
+                except Exception as e:
+                    print(f"Error retrieving text from document {doc_id}: {e}")
+                    texts.append("")
+
+    except Exception as e:
+        print(f"Error handling zip file: {e}")
+        return ["" for _ in doc_ids]
+    return texts
 
 def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
     try:
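
retrieve_recipes_texts() extracts the whole archive on every call. A hypothetical variant (not in this commit) reads a single member straight from the zip, since PdfReader accepts a seekable binary file-like object:

import zipfile
from PyPDF2 import PdfReader

def recipe_text(zip_path: str, doc_id: str) -> str:
    """Read '<doc_id>.pdf' directly from the archive; illustrative helper only."""
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        try:
            with zip_ref.open(f"{doc_id}.pdf") as pdf_file:
                reader = PdfReader(pdf_file)
                return "".join(page.extract_text() or "" for page in reader.pages)
        except KeyError:  # ZipFile.open raises KeyError for a missing member
            return ""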

@@ -646,9 +817,9 @@ async def recipes_endpoint(profile: MedicalProfile):
             raise ValueError("Failed to generate query embedding.")
 
         # Load embeddings and retrieve initial results
-        embeddings_data =
-        folder_path = '
-        initial_results =
+        embeddings_data = load_recipes_embeddings()
+        folder_path = 'pdf kb.zip'
+        initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=10)
         if not initial_results:
             raise ValueError("No relevant recipes found.")
 
@@ -656,7 +827,7 @@
         document_ids = [doc_id for doc_id, _ in initial_results]
 
         # Retrieve document texts
-        document_texts =
+        document_texts = retrieve_recipes_texts(document_ids, folder_path)
         if not document_texts:
            raise ValueError("Failed to retrieve document texts.")
 
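
Taken together, the /recipes retrieval path now chains the new helpers. A sketch of that flow, assuming the app.py functions above are in scope and with embed_query() as a hypothetical stand-in for the app's query-embedding step:

import numpy as np

def embed_query(text: str) -> np.ndarray:
    # Hypothetical stand-in: app.py builds a (1, 384) embedding with its own model.
    return np.random.rand(1, 384)

query_embedding = embed_query("low-sodium dinner ideas")
embeddings_data = load_recipes_embeddings()                # {doc_id: vector}
initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=10)
document_ids = [doc_id for doc_id, _ in initial_results]
document_texts = retrieve_recipes_texts(document_ids, 'pdf kb.zip')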