Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -336,48 +336,6 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
|
|
336 |
texts.append("")
|
337 |
return texts
|
338 |
|
339 |
-
def retrieve_recipes_texts(doc_ids, zip_path='pdf kb.zip'):
    """Return the extracted text of each ``<doc_id>.pdf`` stored in a zip archive.

    The result is positionally aligned with ``doc_ids``: entry *i* holds the
    stripped text of ``f"{doc_ids[i]}.pdf"``, or ``""`` when that member is
    missing from the archive or its text cannot be extracted.

    Args:
        doc_ids: Iterable of document identifiers. It is materialized once,
            so one-shot iterables (generators) are supported.
        zip_path: Path to the zip archive that holds the PDF files.

    Returns:
        A list of strings with exactly one entry per document id. When the
        archive is missing or cannot be opened, every entry is ``""``.

    This function is best-effort by design: failures are reported with
    ``print`` and turned into empty strings instead of being raised.
    """
    import io  # local import: only this function needs an in-memory stream

    # Materialize once so every return path yields one entry per id, even if
    # the caller passed a generator that would otherwise be partly consumed.
    doc_ids = list(doc_ids)
    blanks = [""] * len(doc_ids)
    texts = []

    try:
        # Check if the .zip file exists
        if not os.path.exists(zip_path):
            print(f"Error: Zip file not found at '{zip_path}'")
            return blanks

        # Read the requested members straight from the archive instead of
        # extracting every file to a temporary directory on each call.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            available = set(zip_ref.namelist())

            for doc_id in doc_ids:
                member = f"{doc_id}.pdf"
                if member not in available:
                    print(f"Warning: PDF file not found: {member}")
                    texts.append("")
                    continue

                try:
                    # PdfReader needs a seekable stream; buffer the member
                    # in memory rather than seeking inside the compressed file.
                    reader = PdfReader(io.BytesIO(zip_ref.read(member)))
                    # `or ""` keeps a page that yields no text from failing
                    # the whole document; join avoids repeated concatenation.
                    page_texts = [page.extract_text() or "" for page in reader.pages]
                    texts.append("".join(page_texts).strip())
                except Exception as e:
                    # One unreadable PDF must not abort the remaining ones.
                    print(f"Error retrieving text from document {doc_id}: {e}")
                    texts.append("")

    except Exception as e:
        # Deliberate catch-all at the function boundary: callers expect a
        # list of the right length, never an exception.
        print(f"Error handling zip file: {e}")
        return blanks

    return texts
|
381 |
|
382 |
def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
|
383 |
try:
|
@@ -759,7 +717,7 @@ async def recipes_endpoint(profile: MedicalProfile):
|
|
759 |
document_ids = [doc_id for doc_id, _ in initial_results]
|
760 |
|
761 |
# Retrieve document texts
|
762 |
-
document_texts =
|
763 |
if not document_texts:
|
764 |
raise ValueError("Failed to retrieve document texts.")
|
765 |
|
|
|
336 |
texts.append("")
|
337 |
return texts
|
338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
|
340 |
def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
|
341 |
try:
|
|
|
717 |
document_ids = [doc_id for doc_id, _ in initial_results]
|
718 |
|
719 |
# Retrieve document texts
|
720 |
+
document_texts = retrieve_document_texts(document_ids, folder_path)
|
721 |
if not document_texts:
|
722 |
raise ValueError("Failed to retrieve document texts.")
|
723 |
|