Update app.py
app.py (CHANGED)
@@ -17,9 +17,9 @@ import gc
 from diskcache import Cache
 import time
 
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(
+# Configure logging with a more specific logger name to avoid conflicts
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("ClinicalOversightApp")
 
 # Persistent directory
 persistent_dir = "/data/hf_cache"
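For reference, the new logging setup can be exercised on its own. This is a minimal sketch using only names visible in the diff; the sample output line and the page numbers are illustrative:

    import logging

    # Same configuration as new lines 20-22: timestamped records tagged with an
    # app-specific logger name instead of the root logger.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("ClinicalOversightApp")

    logger.info("Processed %d/%d pages for %s", 10, 42, "report.pdf")
    # e.g. 2024-05-01 12:00:00,123 - ClinicalOversightApp - INFO - Processed 10/42 pages for report.pdf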
@@ -61,7 +61,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
 with pdfplumber.open(file_path) as pdf:
     total_pages = len(pdf.pages)
     if total_pages == 0:
-        logger.error("No pages found in PDF")
+        logger.error("No pages found in PDF: %s", file_path)
         return ""
 
     batch_size = 10
@@ -89,13 +89,13 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
 processed_pages += batch_size
 if progress_callback:
     progress_callback(min(processed_pages, total_pages), total_pages)
-logger.info("Processed %d/%d pages", min(processed_pages, total_pages), total_pages)
+logger.info("Processed %d/%d pages for %s", min(processed_pages, total_pages), total_pages, file_path)
 
 extracted_text = "\n\n".join(filter(None, text_chunks))
-logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+logger.info("Extracted %d pages from %s, total length: %d chars", total_pages, file_path, len(extracted_text))
 return extracted_text
 except Exception as e:
-    logger.error("PDF processing error: %s", e)
+    logger.error("PDF processing error for %s: %s", file_path, e, exc_info=True)
     return f"PDF processing error: {str(e)}"
 
 def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
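The error handlers in this and the later hunks now pass exc_info=True. A small self-contained sketch of what that changes; the failing call is a stand-in, not the app's real pdfplumber code:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("ClinicalOversightApp")

    def parse_pdf(path: str) -> str:
        try:
            raise ValueError("simulated parser failure")  # stand-in for a real pdfplumber error
        except Exception as e:
            # exc_info=True attaches the full traceback to the log record,
            # not just the formatted message.
            logger.error("PDF processing error for %s: %s", path, e, exc_info=True)
            return f"PDF processing error: {str(e)}"

    parse_pdf("broken.pdf")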
@@ -103,8 +103,10 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
 file_h = file_hash(file_path)
 cache_key = f"{file_h}_{file_type}"
 if cache_key in cache:
-    logger.info("
+    logger.info("Cache hit for %s (key: %s)", file_path, cache_key)
     return cache[cache_key]
+else:
+    logger.info("Cache miss for %s (key: %s), performing fresh extraction", file_path, cache_key)
 
 if file_type == "pdf":
     text = extract_all_pages(file_path, progress_callback)
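The cache hit/miss logging above sits on top of a diskcache keyed by a content hash. A minimal sketch of that pattern, assuming file_hash simply digests the file bytes (the app's real helper may differ) and using an illustrative cache directory:

    import hashlib
    from diskcache import Cache

    cache = Cache("/tmp/extraction_cache")  # illustrative path, not the app's

    def file_hash(path: str) -> str:
        # Assumed behaviour of the app's file_hash(): digest the file contents,
        # so the key changes whenever the file changes.
        with open(path, "rb") as fh:
            return hashlib.md5(fh.read()).hexdigest()

    def cached_convert(path: str, file_type: str) -> str:
        key = f"{file_hash(path)}_{file_type}"
        if key in cache:          # hit: reuse the earlier extraction
            return cache[key]
        result = f"extracted text for {path}"  # placeholder for the real extraction
        cache[key] = result       # miss: store the fresh result for next time
        return result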
@@ -128,7 +130,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
 logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
 return result
 except Exception as e:
-    logger.error("Error processing %s: %s", os.path.basename(file_path), e)
+    logger.error("Error processing %s: %s", os.path.basename(file_path), e, exc_info=True)
     return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 
 def log_system_usage(tag=""):
@@ -259,28 +261,42 @@ Patient Record Excerpt (Chunk {0} of {1}):
 yield history, None, ""
 logger.info("Starting analysis for message: %s", message[:100])
 
+if not files:
+    logger.error("No files uploaded for analysis")
+    history.append({"role": "assistant", "content": "❌ Please upload a file to analyze."})
+    yield history, None, "### Summary of Clinical Oversights\nNo file uploaded for analysis."
+    return
+
 extracted = ""
 file_hash_value = ""
- [16 lines removed here; their content was not captured in this rendering]
+logger.info("Processing %d uploaded files", len(files))
+for f in files:
+    logger.info("Processing file: %s", f.name)
+
+def update_extraction_progress(current, total):
+    progress(current / total, desc=f"Extracting text... Page {current}/{total}")
+    return history, None, ""
+
+with ThreadPoolExecutor(max_workers=6) as executor:
+    futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
+    results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
+extracted = "\n".join(results)
+file_hash_value = file_hash(files[0].name) if files else ""
+logger.info("Extraction complete for %d files", len(files))
+history.append({"role": "assistant", "content": "✅ Text extraction complete."})
+yield history, None, ""
 
 logger.info("Extracted text length: %d chars", len(extracted))
+if len(extracted.strip()) == 0:
+    logger.error("Extracted text is empty")
+    history.append({"role": "assistant", "content": "❌ Extracted text is empty. Please ensure the file contains readable content."})
+    yield history, None, "### Summary of Clinical Oversights\nExtracted text is empty."
+    return
+
 chunk_size = 6000
 chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+if not chunks:
+    chunks = [""]
 logger.info("Created %d chunks", len(chunks))
 combined_response = ""
 batch_size = 2
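The rewritten extraction block fans per-file work out to a thread pool. A self-contained sketch of that pattern with simplified stand-ins for sanitize_utf8 and convert_file_to_json; note that as_completed yields futures in completion order, so the joined results are not guaranteed to follow the upload order:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def sanitize_utf8(text: str) -> str:
        # Simplified stand-in for the app's helper.
        return text.encode("utf-8", "ignore").decode("utf-8")

    def convert_file_to_json(path: str, file_type: str) -> str:
        # Placeholder for the real extraction pipeline.
        return f'{{"file": "{path}", "type": "{file_type}"}}'

    def extract_all(paths):
        with ThreadPoolExecutor(max_workers=6) as executor:
            futures = [executor.submit(convert_file_to_json, p, p.split(".")[-1].lower())
                       for p in paths]
            results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
        return "\n".join(results)

    print(extract_all(["notes.pdf", "labs.csv"]))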
@@ -337,7 +353,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
 yield history, report_path if report_path and os.path.exists(report_path) else None, summary
 
 except Exception as e:
-    logger.error("Analysis error: %s", e)
+    logger.error("Analysis error: %s", e, exc_info=True)
     history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
     yield history, None, f"### Summary of Clinical Oversights\nError occurred during analysis: {str(e)}"
 