Update app.py

app.py CHANGED
@@ -1,6 +1,7 @@
 import transformers
 import pickle
 import os
+import re
 import numpy as np
 import torchvision
 import nltk
@@ -158,6 +159,14 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
         print(f"Error loading embeddings: {e}")
         return None
 
+def normalize_key(key: str) -> str:
+    """Normalize embedding keys to match metadata IDs."""
+    match = re.search(r'file_(\d+)', key)
+    if match:
+        return match.group(1)  # Extract the numeric part
+    return key
+
+
 def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
     try:
         embeddings_path = 'recipes_embeddings.safetensors'
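As a quick sanity check, the new helper reduces any key containing file_<n> to the bare number and leaves everything else untouched (the key names below are made up; the real ones come from recipes_embeddings.safetensors):

    import re

    def normalize_key(key: str) -> str:  # copied from the hunk above
        """Normalize embedding keys to match metadata IDs."""
        match = re.search(r'file_(\d+)', key)
        if match:
            return match.group(1)  # Extract the numeric part
        return key

    assert normalize_key("file_123") == "123"
    assert normalize_key("recipes/file_7.html") == "7"
    assert normalize_key("no_numeric_id") == "no_numeric_id"  # passes through unchanged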
@@ -176,8 +185,9 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
         keys = list(f.keys())
         for key in keys:
             try:
+                normalized_key = normalize_key(key)
                 tensor = f.get_tensor(key)
-                embeddings[key] = tensor.numpy()
+                embeddings[normalized_key] = tensor.numpy()
             except Exception as key_error:
                 print(f"Failed to process key {key}: {key_error}")
 
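For context, here is the loading pattern this hunk modifies, collapsed into a standalone sketch. The safe_open call is an assumption (the diff only shows the f.get_tensor loop) and presumes the file was saved from PyTorch tensors:

    from safetensors import safe_open

    embeddings = {}
    with safe_open("recipes_embeddings.safetensors", framework="pt") as f:
        for key in list(f.keys()):
            try:
                normalized_key = normalize_key(key)  # e.g. "file_123" -> "123"
                embeddings[normalized_key] = f.get_tensor(key).numpy()
            except Exception as key_error:
                print(f"Failed to process key {key}: {key_error}")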
@@ -291,8 +301,8 @@ def query_recipes_embeddings(query_embedding, embeddings_data=None, n_results=5)
         print("No embeddings data available.")
         return []
     try:
-        doc_ids = embeddings_data
-        doc_embeddings = embeddings_data
+        doc_ids = list(embeddings_data.keys())
+        doc_embeddings = np.array(list(embeddings_data.values()))
         similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
         top_indices = similarities.argsort()[-n_results:][::-1]
         return [(doc_ids[i], similarities[i]) for i in top_indices]
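The fix unpacks IDs and vectors from the single {doc_id: vector} dict, relying on keys() and values() iterating in the same insertion order (guaranteed since Python 3.7). A toy run of the same logic:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # Stand-in embeddings; the real vectors come from load_recipes_embeddings().
    embeddings_data = {"101": np.array([1.0, 0.0]), "102": np.array([0.0, 1.0])}
    query_embedding = np.array([[0.9, 0.1]])  # shape (1, dim), as cosine_similarity expects

    doc_ids = list(embeddings_data.keys())
    doc_embeddings = np.array(list(embeddings_data.values()))
    similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
    top_indices = similarities.argsort()[-2:][::-1]
    print([(doc_ids[i], round(float(similarities[i]), 3)) for i in top_indices])
    # [('101', 0.994), ('102', 0.11)] -- highest-similarity document first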
@@ -332,10 +342,11 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
         texts.append("")
     return texts
 
-def retrieve_rec_texts(
+def retrieve_rec_texts(document_ids, folder_path):
     document_texts = []
     for doc_id in document_ids:
-
+        file_name = f"file_{doc_id}.html"  # Map numeric ID back to the original file name
+        file_path = os.path.join(folder_path, file_name)
         if os.path.exists(file_path):
             with open(file_path, "r") as f:
                 document_texts.append(f.read())
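This is the inverse of normalize_key: the numeric ID coming back from the query step is mapped onto the on-disk name before reading (the folder name below is illustrative; the real default is truncated in the hunk header above):

    import os

    doc_id = "123"  # normalized ID returned by query_recipes_embeddings
    file_path = os.path.join("downloaded_articles", f"file_{doc_id}.html")
    print(file_path)  # downloaded_articles/file_123.html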
@@ -743,18 +754,18 @@ async def recipes_endpoint(profile: MedicalProfile):
     recipes = []
     for score, doc_id, text in scored_documents:
         # Retrieve metadata for the document
-        doc_info = metadata_df[metadata_df["
+        doc_info = metadata_df[metadata_df["id"] == int(doc_id)]  # Match numeric ID
         if not doc_info.empty:
-            title = doc_info.iloc[0]["
-
-
-
-
-
-            "score": score,
+            title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
+            recipes.append({
+                "id": doc_id,
+                "title": title,
+                "content_preview": text[:200],  # First 200 characters
+                "score": score,
             })
 
 
+
     # Limit the response to top 5 recipes
     return {"recipes": recipes[:5], "success": True}
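The endpoint lookup hinges on a type mismatch: the normalized doc_id is a string while metadata_df["id"] presumably holds integers, hence the int(doc_id) cast. A toy version of the lookup (the frame below is illustrative; the real rows come from the project's metadata file):

    import pandas as pd

    metadata_df = pd.DataFrame({
        "id": [123, 456],
        "original_file_name": ["file_123.html", "file_456.html"],
    })

    doc_id = "123"  # normalized key is a string
    doc_info = metadata_df[metadata_df["id"] == int(doc_id)]
    title = doc_info.iloc[0]["original_file_name"] if "original_file_name" in doc_info.columns else "Unknown Title"
    print(title)  # file_123.html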