thechaiexperiment committed on
Commit
622d3ee
·
verified ·
1 Parent(s): c9d5683

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -13
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import transformers
2
  import pickle
3
  import os
 
4
  import numpy as np
5
  import torchvision
6
  import nltk
@@ -158,6 +159,14 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
158
  print(f"Error loading embeddings: {e}")
159
  return None
160
 
 
 
 
 
 
 
 
 
161
  def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
162
  try:
163
  embeddings_path = 'recipes_embeddings.safetensors'
@@ -176,8 +185,9 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
176
  keys = list(f.keys())
177
  for key in keys:
178
  try:
 
179
  tensor = f.get_tensor(key)
180
- embeddings[key] = tensor.numpy()
181
  except Exception as key_error:
182
  print(f"Failed to process key {key}: {key_error}")
183
 
@@ -291,8 +301,8 @@ def query_recipes_embeddings(query_embedding, embeddings_data=None, n_results=5)
291
  print("No embeddings data available.")
292
  return []
293
  try:
294
- doc_ids = embeddings_data["doc_ids"]
295
- doc_embeddings = embeddings_data["embeddings"]
296
  similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
297
  top_indices = similarities.argsort()[-n_results:][::-1]
298
  return [(doc_ids[i], similarities[i]) for i in top_indices]
@@ -332,10 +342,11 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
332
  texts.append("")
333
  return texts
334
 
335
- def retrieve_rec_texts(document_ids_or_names, folder_path):
336
  document_texts = []
337
  for doc_id in document_ids:
338
- file_path = os.path.join(folder_path, doc_id) # Match by file name
 
339
  if os.path.exists(file_path):
340
  with open(file_path, "r") as f:
341
  document_texts.append(f.read())
@@ -743,18 +754,18 @@ async def recipes_endpoint(profile: MedicalProfile):
743
  recipes = []
744
  for score, doc_id, text in scored_documents:
745
  # Retrieve metadata for the document
746
- doc_info = metadata_df[metadata_df["original_file_name"] == doc_id]
747
  if not doc_info.empty:
748
- title = doc_info.iloc[0]["title"] if "title" in doc_info.columns else "Unknown Title"
749
- if "recipe" in text.lower() or "meal" in text.lower():
750
- recipes.append({
751
- "id": doc_id,
752
- "title": title,
753
- "content_preview": text[:200], # First 200 characters
754
- "score": score,
755
  })
756
 
757
 
 
758
  # Limit the response to top 5 recipes
759
  return {"recipes": recipes[:5], "success": True}
760
 
 
1
  import transformers
2
  import pickle
3
  import os
4
+ import re
5
  import numpy as np
6
  import torchvision
7
  import nltk
 
159
  print(f"Error loading embeddings: {e}")
160
  return None
161
 
162
def normalize_key(key: str) -> str:
    """Normalize embedding keys to match metadata IDs.

    Keys of the form ``file_<digits>`` (anywhere in the string) are reduced
    to just the digit run; any other key is returned unchanged.
    """
    numeric = re.search(r'file_(\d+)', key)
    return numeric.group(1) if numeric else key
168
+
169
+
170
  def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
171
  try:
172
  embeddings_path = 'recipes_embeddings.safetensors'
 
185
  keys = list(f.keys())
186
  for key in keys:
187
  try:
188
+ normalized_key = normalize_key(key)
189
  tensor = f.get_tensor(key)
190
+ embeddings[normalized_key] = tensor.numpy()
191
  except Exception as key_error:
192
  print(f"Failed to process key {key}: {key_error}")
193
 
 
301
  print("No embeddings data available.")
302
  return []
303
  try:
304
+ doc_ids = list(embeddings_data.keys())
305
+ doc_embeddings = np.array(list(embeddings_data.values()))
306
  similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
307
  top_indices = similarities.argsort()[-n_results:][::-1]
308
  return [(doc_ids[i], similarities[i]) for i in top_indices]
 
342
  texts.append("")
343
  return texts
344
 
345
+ def retrieve_rec_texts(document_ids, folder_path):
346
  document_texts = []
347
  for doc_id in document_ids:
348
+ file_name = f"file_{doc_id}.html" # Map numeric ID back to the original file name
349
+ file_path = os.path.join(folder_path, file_name)
350
  if os.path.exists(file_path):
351
  with open(file_path, "r") as f:
352
  document_texts.append(f.read())
 
754
  recipes = []
755
  for score, doc_id, text in scored_documents:
756
  # Retrieve metadata for the document
757
+ doc_info = metadata_df[metadata_df["id"] == int(doc_id)] # Match numeric ID
758
  if not doc_info.empty:
759
+ title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
760
+ recipes.append({
761
+ "id": doc_id,
762
+ "title": title,
763
+ "content_preview": text[:200], # First 200 characters
764
+ "score": score,
 
765
  })
766
 
767
 
768
+
769
  # Limit the response to top 5 recipes
770
  return {"recipes": recipes[:5], "success": True}
771