Update app.py
Browse files
app.py
CHANGED
@@ -19,6 +19,7 @@ import time
|
|
19 |
from transformers import AutoTokenizer
|
20 |
from functools import lru_cache
|
21 |
import numpy as np
|
|
|
22 |
|
23 |
# Configure logging
|
24 |
logging.basicConfig(level=logging.INFO)
|
@@ -224,38 +225,53 @@ def log_system_usage(tag=""):
|
|
224 |
logger.error(f"[{tag}] Monitor failed: {e}")
|
225 |
|
226 |
def clean_response(text: str) -> str:
|
227 |
-
"""Enhanced response cleaning with aggressive
|
228 |
if not text:
|
229 |
return ""
|
230 |
|
231 |
# Pre-compiled regex patterns for cleaning
|
232 |
patterns = [
|
233 |
(re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
|
|
|
234 |
(re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
|
235 |
(re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
|
236 |
(re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
|
237 |
-
(re.compile(r"(Therefore, )?I will start by retrieving.*?\.", re.IGNORECASE), ""),
|
238 |
-
(re.compile(r"(Therefore, )?Retrieving tools.*?\.", re.IGNORECASE), ""),
|
239 |
(re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
|
240 |
(re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
|
|
|
241 |
(re.compile(r"\s+"), " "),
|
242 |
(re.compile(r"[^\w\s\.\,\(\)\-]"), ""),
|
243 |
-
(re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"),
|
244 |
]
|
245 |
|
246 |
for pattern, repl in patterns:
|
247 |
text = pattern.sub(repl, text)
|
248 |
|
249 |
-
# Deduplicate identical sentences
|
250 |
sentences = text.split(". ")
|
|
|
251 |
seen = set()
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
text = ". ".join(unique_sentences).strip()
|
254 |
|
255 |
return text if text else "No missed diagnoses identified."
|
256 |
|
257 |
def summarize_findings(combined_response: str) -> str:
|
258 |
-
"""Enhanced findings summarization for a single,
|
259 |
if not combined_response:
|
260 |
return "No missed diagnoses were identified in the provided records."
|
261 |
|
@@ -352,7 +368,7 @@ def init_agent():
|
|
352 |
def create_ui(agent):
|
353 |
"""Optimized UI creation with pre-compiled templates"""
|
354 |
PROMPT_TEMPLATE = """
|
355 |
-
Analyze the patient record excerpt for missed diagnoses, focusing
|
356 |
Patient Record Excerpt (Chunk {0} of {1}):
|
357 |
{chunk}
|
358 |
"""
|
@@ -390,7 +406,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
390 |
extracted.extend(cache[cache_key])
|
391 |
else:
|
392 |
result = process_file_cached(f.name, file_type)
|
393 |
-
cache[
|
394 |
extracted.extend(result)
|
395 |
|
396 |
file_hash_value = file_hash(files[0].name) if files else ""
|
@@ -409,6 +425,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
409 |
|
410 |
combined_response = ""
|
411 |
report_path = None
|
|
|
412 |
|
413 |
try:
|
414 |
# Process in optimized batches
|
@@ -456,7 +473,15 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
456 |
content = clean_response(chunk_output)
|
457 |
|
458 |
if content and content != "No missed diagnoses identified.":
|
459 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
460 |
|
461 |
if chunk_response:
|
462 |
combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{chunk_response.strip()}\n"
|
|
|
19 |
from transformers import AutoTokenizer
|
20 |
from functools import lru_cache
|
21 |
import numpy as np
|
22 |
+
from difflib import SequenceMatcher
|
23 |
|
24 |
# Configure logging
|
25 |
logging.basicConfig(level=logging.INFO)
|
|
|
225 |
logger.error(f"[{tag}] Monitor failed: {e}")
|
226 |
|
227 |
# Cleaning rules, compiled once at import time and applied in order.
# Each entry is (compiled pattern, replacement).
_CLEAN_PATTERNS = (
    # Bracketed tool/debug markers and stray "None" tokens.
    (re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
    # Boilerplate sentences in which the model narrates its own process.
    (re.compile(r"(The patient record excerpt provides|Patient record excerpt contains).*?(John Doe|general information).*?\.", re.IGNORECASE), ""),
    (re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
    (re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
    (re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
    (re.compile(r"(Therefore, )?(Retrieving|I will start by retrieving) tools.*?\.", re.IGNORECASE), ""),
    (re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
    (re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
    (re.compile(r"Final Analysis\s*", re.IGNORECASE), ""),
    # Drop markup/symbol noise but keep "/" and "%" so clinical values such
    # as "160/95" or "45%" are not corrupted.
    (re.compile(r"[^\w\s\.\,\(\)\-/%]"), ""),
    # Collapse whitespace AFTER symbol removal so no double spaces remain.
    (re.compile(r"\s+"), " "),
    # Collapse immediate repeats of the "no findings" sentence.
    (re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"),
)

# Two sentences whose similarity ratio exceeds this are treated as duplicates.
_NEAR_DUPLICATE_RATIO = 0.9

# Fallback returned when cleaning leaves nothing behind.
_NO_FINDINGS = "No missed diagnoses identified."


def clean_response(text: str) -> str:
    """Strip boilerplate from a model response and drop near-duplicate sentences.

    The cleaning patterns are applied in order, then the text is split on
    ". " and each sentence is compared (case-insensitively, ignoring a
    trailing period) against the sentences already kept using
    ``difflib.SequenceMatcher``. Sentences that are more than 90% similar to
    an earlier one are discarded.

    Args:
        text: Raw response text. May be empty or ``None``.

    Returns:
        ``""`` when ``text`` is falsy; the cleaned text otherwise; or
        ``"No missed diagnoses identified."`` when cleaning removes everything.
    """
    if not text:
        return ""

    for pattern, repl in _CLEAN_PATTERNS:
        text = pattern.sub(repl, text)
    text = text.strip()

    unique_sentences = []
    seen_keys = []
    for sentence in text.split(". "):
        # Splitting on ". " leaves the final period attached to the last
        # sentence only; normalise it away so "X" and "X." compare equal.
        key = sentence.strip().rstrip(".").lower()
        if not key:
            continue
        is_duplicate = any(
            SequenceMatcher(None, key, kept).ratio() > _NEAR_DUPLICATE_RATIO
            for kept in seen_keys
        )
        if is_duplicate:
            continue
        unique_sentences.append(sentence)
        seen_keys.append(key)

    result = ". ".join(unique_sentences).strip()
    # If the terminal sentence was dropped as a duplicate, its period went
    # with it; restore it so the output still ends the way the input did.
    if result and text.endswith(".") and not result.endswith("."):
        result += "."

    return result if result else _NO_FINDINGS
|
272 |
|
273 |
def summarize_findings(combined_response: str) -> str:
|
274 |
+
"""Enhanced findings summarization for a single, concise paragraph"""
|
275 |
if not combined_response:
|
276 |
return "No missed diagnoses were identified in the provided records."
|
277 |
|
|
|
368 |
def create_ui(agent):
|
369 |
"""Optimized UI creation with pre-compiled templates"""
|
370 |
PROMPT_TEMPLATE = """
|
371 |
+
Analyze the patient record excerpt for missed diagnoses, focusing ONLY on clinical findings such as symptoms, medications, or evaluation results provided in the excerpt. Provide a concise, evidence-based summary in ONE paragraph without headings, bullet points, or repeating non-clinical data (e.g., name, date of birth, allergies). Include specific findings (e.g., 'elevated blood pressure (160/95)'), their implications (e.g., 'may indicate untreated hypertension'), and recommend urgent review. Treat medications or psychiatric evaluations as potential missed diagnoses. Do NOT use external tools, retrieve additional data, or summarize non-clinical information. If no clinical findings are present, state 'No missed diagnoses identified' in ONE sentence. Ignore other oversight categories (e.g., medication conflicts).
|
372 |
Patient Record Excerpt (Chunk {0} of {1}):
|
373 |
{chunk}
|
374 |
"""
|
|
|
406 |
extracted.extend(cache[cache_key])
|
407 |
else:
|
408 |
result = process_file_cached(f.name, file_type)
|
409 |
+
cache[key] = result
|
410 |
extracted.extend(result)
|
411 |
|
412 |
file_hash_value = file_hash(files[0].name) if files else ""
|
|
|
425 |
|
426 |
combined_response = ""
|
427 |
report_path = None
|
428 |
+
seen_responses = set() # Track unique responses to avoid repetition
|
429 |
|
430 |
try:
|
431 |
# Process in optimized batches
|
|
|
473 |
content = clean_response(chunk_output)
|
474 |
|
475 |
if content and content != "No missed diagnoses identified.":
|
476 |
+
# Check for near-duplicate responses
|
477 |
+
is_unique = True
|
478 |
+
for seen_response in seen_responses:
|
479 |
+
if SequenceMatcher(None, content.lower(), seen_response.lower()).ratio() > 0.9:
|
480 |
+
is_unique = False
|
481 |
+
break
|
482 |
+
if is_unique:
|
483 |
+
chunk_response += content + " "
|
484 |
+
seen_responses.add(content)
|
485 |
|
486 |
if chunk_response:
|
487 |
combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{chunk_response.strip()}\n"
|