thechaiexperiment committed on
Commit
85619b4
·
verified ·
1 Parent(s): 622d3ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -17
app.py CHANGED
@@ -342,17 +342,33 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
342
  texts.append("")
343
  return texts
344
 
345
- def retrieve_rec_texts(document_ids, folder_path):
346
- document_texts = []
 
 
 
 
 
 
 
347
  for doc_id in document_ids:
348
- file_name = f"file_{doc_id}.html" # Map numeric ID back to the original file name
349
- file_path = os.path.join(folder_path, file_name)
 
 
 
 
 
 
350
  if os.path.exists(file_path):
351
- with open(file_path, "r") as f:
352
  document_texts.append(f.read())
 
 
353
  return document_texts
354
 
355
 
 
356
  def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
357
  try:
358
  # Prepare pairs for the cross-encoder
@@ -725,7 +741,7 @@ async def recipes_endpoint(profile: MedicalProfile):
725
  # Load embeddings and retrieve initial results
726
  embeddings_data = load_recipes_embeddings()
727
  folder_path = 'downloaded_articles/downloaded_articles'
728
- initial_results = query_embeddings(query_embedding, embeddings_data, n_results=10)
729
  if not initial_results:
730
  raise ValueError("No relevant recipes found.")
731
 
@@ -748,22 +764,17 @@ async def recipes_endpoint(profile: MedicalProfile):
748
 
749
  # Load recipe metadata from DataFrame
750
  file_path = 'recipes_metadata.xlsx'
 
751
  metadata_df = pd.read_excel(file_path)
752
 
753
  # Prepare the final recipes list
754
  recipes = []
755
- for score, doc_id, text in scored_documents:
756
- # Retrieve metadata for the document
757
- doc_info = metadata_df[metadata_df["id"] == int(doc_id)] # Match numeric ID
758
- if not doc_info.empty:
759
- title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
760
- recipes.append({
761
- "id": doc_id,
762
- "title": title,
763
- "content_preview": text[:200], # First 200 characters
764
- "score": score,
765
- })
766
 
 
 
767
 
768
 
769
  # Limit the response to top 5 recipes
 
342
  texts.append("")
343
  return texts
344
 
345
def retrieve_rec_texts(document_ids, folder_path, metadata_path):
    """Return the text of every requested recipe document that can be read.

    The metadata workbook maps each document ID to the file name the
    document is stored under inside ``folder_path``.

    Args:
        document_ids: Iterable of document IDs. IDs are compared by their
            string form, so ``12`` and ``"12"`` refer to the same row.
        folder_path: Directory that contains the document files.
        metadata_path: Path to an Excel workbook with ``id`` and
            ``original_file_name`` columns.

    Returns:
        A list with the contents of each document that was found, in request
        order. IDs without a usable file name or whose file is missing are
        reported with a warning and skipped, so the result may be shorter
        than ``document_ids``.
        NOTE(review): callers that pair this list positionally with
        ``document_ids`` will misalign when something is skipped -- confirm
        against the call site.

    Raises:
        ValueError: If the workbook lacks a required column.
    """
    # Materialize once so generators and array-likes behave like plain lists.
    document_ids = list(document_ids)
    if not document_ids:
        # Nothing requested: avoid parsing the workbook at all.
        return []

    metadata_df = pd.read_excel(metadata_path)
    if not {"id", "original_file_name"}.issubset(metadata_df.columns):
        raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")

    # Keys are stringified here, so lookups below must stringify as well.
    id_to_file_name = dict(
        zip(metadata_df["id"].astype(str), metadata_df["original_file_name"])
    )

    document_texts = []
    for doc_id in document_ids:
        original_file_name = id_to_file_name.get(str(doc_id))
        # Empty Excel cells come back as NaN (a truthy float), so test the
        # type explicitly instead of relying on truthiness alone.
        if not isinstance(original_file_name, str) or not original_file_name:
            print(f"Warning: No original file name found for document ID {doc_id}")
            continue

        file_path = os.path.join(folder_path, original_file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                document_texts.append(f.read())
        except FileNotFoundError:
            print(f"Warning: File not found for {file_path}")
    return document_texts
369
 
370
 
371
+
372
  def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
373
  try:
374
  # Prepare pairs for the cross-encoder
 
741
  # Load embeddings and retrieve initial results
742
  embeddings_data = load_recipes_embeddings()
743
  folder_path = 'downloaded_articles/downloaded_articles'
744
+ initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=10)
745
  if not initial_results:
746
  raise ValueError("No relevant recipes found.")
747
 
 
764
 
765
  # Load recipe metadata from DataFrame
766
  file_path = 'recipes_metadata.xlsx'
767
+ metadata_path = 'recipes_metadata.xlsx'
768
  metadata_df = pd.read_excel(file_path)
769
 
770
  # Prepare the final recipes list
771
  recipes = []
772
+ # Combine scores with resources
773
+ for i, recipe in enumerate(recipes):
774
+ recipe["score"] = scores[i] if i < len(scores) else 0.0
 
 
 
 
 
 
 
 
775
 
776
+ # Sort resources by score
777
+ recipes.sort(key=lambda x: x["score"], reverse=True)
778
 
779
 
780
  # Limit the response to top 5 recipes