Spaces:

thechaiexperiment
/

TeaRAG

Sleeping

App Files Files Community

thechaiexperiment committed on Jan 22

Commit

85619b4

verified ·

1 Parent(s): 622d3ee

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -17

app.py CHANGED Viewed

@@ -342,17 +342,33 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
             texts.append("")
     return texts
-def retrieve_rec_texts(document_ids, folder_path):
-    document_texts = []
     for doc_id in document_ids:
-        file_name = f"file_{doc_id}.html"  # Map numeric ID back to the original file name
-        file_path = os.path.join(folder_path, file_name)
         if os.path.exists(file_path):
-            with open(file_path, "r") as f:
                 document_texts.append(f.read())
     return document_texts
 def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
     try:
         # Prepare pairs for the cross-encoder
@@ -725,7 +741,7 @@ async def recipes_endpoint(profile: MedicalProfile):
         # Load embeddings and retrieve initial results
         embeddings_data = load_recipes_embeddings()
         folder_path = 'downloaded_articles/downloaded_articles'
-        initial_results = query_embeddings(query_embedding, embeddings_data, n_results=10)
         if not initial_results:
             raise ValueError("No relevant recipes found.")
@@ -748,22 +764,17 @@ async def recipes_endpoint(profile: MedicalProfile):
         # Load recipe metadata from DataFrame
         file_path = 'recipes_metadata.xlsx'
         metadata_df = pd.read_excel(file_path)
         # Prepare the final recipes list
         recipes = []
-        for score, doc_id, text in scored_documents:
-            # Retrieve metadata for the document
-            doc_info = metadata_df[metadata_df["id"] == int(doc_id)]  # Match numeric ID
-            if not doc_info.empty:
-                title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
-                recipes.append({
-                    "id": doc_id,
-                    "title": title,
-                    "content_preview": text[:200],  # First 200 characters
-                    "score": score,
-                    })
         # Limit the response to top 5 recipes

             texts.append("")
     return texts
def retrieve_rec_texts(document_ids, folder_path, metadata_path):
    """Return the text of each document whose file can be located on disk.

    The metadata spreadsheet at ``metadata_path`` is read with
    ``pd.read_excel`` and used to translate each document ID into the file
    name stored in its ``original_file_name`` column; that file is then read
    from ``folder_path`` as UTF-8.

    Args:
        document_ids: Iterable of document IDs. Each ID is compared to the
            metadata ``id`` column by its string form, so ``7`` and ``"7"``
            resolve to the same row.
        folder_path: Directory that contains the document files.
        metadata_path: Path to the Excel file holding the ``id`` and
            ``original_file_name`` columns.

    Returns:
        A list with the contents of every document that was found. IDs with
        no metadata row, no file name, or no file on disk are skipped after
        printing a warning, so the result may be shorter than
        ``document_ids``.

    Raises:
        ValueError: If the metadata file lacks the ``id`` or
            ``original_file_name`` column.
    """
    metadata_df = pd.read_excel(metadata_path)

    required_columns = {"id", "original_file_name"}
    if not required_columns.issubset(metadata_df.columns):
        raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")

    # Blank spreadsheet cells are read as NaN, which is truthy and would crash
    # os.path.join below; drop those rows so they take the "no file name" path.
    named_rows = metadata_df.dropna(subset=["original_file_name"])
    id_to_file_name = dict(
        zip(
            named_rows["id"].astype(str),
            named_rows["original_file_name"].astype(str),
        )
    )

    document_texts = []
    for doc_id in document_ids:
        # Keys are strings, so normalise the lookup key as well; otherwise an
        # integer ID would never match and every document would be skipped.
        original_file_name = id_to_file_name.get(str(doc_id))
        if not original_file_name:
            print(f"Warning: No original file name found for document ID {doc_id}")
            continue

        file_path = os.path.join(folder_path, original_file_name)
        # isfile (not exists) so that a directory with a matching name is
        # reported as missing instead of raising from open().
        if os.path.isfile(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                document_texts.append(f.read())
        else:
            print(f"Warning: File not found for {file_path}")

    # NOTE(review): skipped IDs leave no placeholder, so the result is not
    # index-aligned with document_ids — confirm callers do not zip the two.
    return document_texts
 def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
     try:
         # Prepare pairs for the cross-encoder
         # Load embeddings and retrieve initial results
         embeddings_data = load_recipes_embeddings()
         folder_path = 'downloaded_articles/downloaded_articles'
+        initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=10)
         if not initial_results:
             raise ValueError("No relevant recipes found.")
         # Load recipe metadata from DataFrame
         file_path = 'recipes_metadata.xlsx'
+        metadata_path = 'recipes_metadata.xlsx'
         metadata_df = pd.read_excel(file_path)
         # Prepare the final recipes list
         recipes = []
+        # Combine scores with resources
+        for i, recipe in enumerate(recipes):
+            recipe["score"] = scores[i] if i < len(scores) else 0.0
+        # Sort resources by score
+        recipes.sort(key=lambda x: x["score"], reverse=True)
         # Limit the response to top 5 recipes