Ali2206 committed on
Commit
f5365bc
·
verified ·
1 Parent(s): fa0b058

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -10
app.py CHANGED
@@ -19,6 +19,7 @@ import time
19
  from transformers import AutoTokenizer
20
  from functools import lru_cache
21
  import numpy as np
 
22
 
23
  # Configure logging
24
  logging.basicConfig(level=logging.INFO)
@@ -224,38 +225,53 @@ def log_system_usage(tag=""):
224
  logger.error(f"[{tag}] Monitor failed: {e}")
225
 
226
  def clean_response(text: str) -> str:
227
- """Enhanced response cleaning with aggressive artifact removal"""
228
  if not text:
229
  return ""
230
 
231
  # Pre-compiled regex patterns for cleaning
232
  patterns = [
233
  (re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
 
234
  (re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
235
  (re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
236
  (re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
237
- (re.compile(r"(Therefore, )?I will start by retrieving.*?\.", re.IGNORECASE), ""),
238
- (re.compile(r"(Therefore, )?Retrieving tools.*?\.", re.IGNORECASE), ""),
239
  (re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
240
  (re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
 
241
  (re.compile(r"\s+"), " "),
242
  (re.compile(r"[^\w\s\.\,\(\)\-]"), ""),
243
- (re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"), # Deduplicate
244
  ]
245
 
246
  for pattern, repl in patterns:
247
  text = pattern.sub(repl, text)
248
 
249
- # Deduplicate identical sentences
250
  sentences = text.split(". ")
 
251
  seen = set()
252
- unique_sentences = [s for s in sentences if s and not (s in seen or seen.add(s))]
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  text = ". ".join(unique_sentences).strip()
254
 
255
  return text if text else "No missed diagnoses identified."
256
 
257
  def summarize_findings(combined_response: str) -> str:
258
- """Enhanced findings summarization for a single, detailed paragraph"""
259
  if not combined_response:
260
  return "No missed diagnoses were identified in the provided records."
261
 
@@ -352,7 +368,7 @@ def init_agent():
352
  def create_ui(agent):
353
  """Optimized UI creation with pre-compiled templates"""
354
  PROMPT_TEMPLATE = """
355
- Analyze the patient record excerpt for missed diagnoses, focusing only on clinical findings such as symptoms, medications, or evaluation results provided in the excerpt. Provide a concise, evidence-based summary in one paragraph without headings or bullet points. Include specific findings (e.g., 'elevated blood pressure (160/95) on page 10'), their implications (e.g., 'may indicate untreated hypertension'), and recommend urgent review. Do NOT use external tools or retrieve additional data. If no missed diagnoses are found, state 'No missed diagnoses identified' in one sentence. Ignore non-clinical data (e.g., name, date of birth) and other oversight categories (e.g., medication conflicts).
356
  Patient Record Excerpt (Chunk {0} of {1}):
357
  {chunk}
358
  """
@@ -390,7 +406,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
390
  extracted.extend(cache[cache_key])
391
  else:
392
  result = process_file_cached(f.name, file_type)
393
- cache[cache_key] = result
394
  extracted.extend(result)
395
 
396
  file_hash_value = file_hash(files[0].name) if files else ""
@@ -409,6 +425,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
409
 
410
  combined_response = ""
411
  report_path = None
 
412
 
413
  try:
414
  # Process in optimized batches
@@ -456,7 +473,15 @@ Patient Record Excerpt (Chunk {0} of {1}):
456
  content = clean_response(chunk_output)
457
 
458
  if content and content != "No missed diagnoses identified.":
459
- chunk_response += content + " "
 
 
 
 
 
 
 
 
460
 
461
  if chunk_response:
462
  combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{chunk_response.strip()}\n"
 
19
  from transformers import AutoTokenizer
20
  from functools import lru_cache
21
  import numpy as np
22
+ from difflib import SequenceMatcher
23
 
24
  # Configure logging
25
  logging.basicConfig(level=logging.INFO)
 
225
  logger.error(f"[{tag}] Monitor failed: {e}")
226
 
227
  def clean_response(text: str) -> str:
228
+ """Enhanced response cleaning with aggressive deduplication"""
229
  if not text:
230
  return ""
231
 
232
  # Pre-compiled regex patterns for cleaning
233
  patterns = [
234
  (re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
235
+ (re.compile(r"(The patient record excerpt provides|Patient record excerpt contains).*?(John Doe|general information).*?\.", re.IGNORECASE), ""),
236
  (re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
237
  (re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
238
  (re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
239
+ (re.compile(r"(Therefore, )?(Retrieving|I will start by retrieving) tools.*?\.", re.IGNORECASE), ""),
 
240
  (re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
241
  (re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
242
+ (re.compile(r"Final Analysis\s*", re.IGNORECASE), ""),
243
  (re.compile(r"\s+"), " "),
244
  (re.compile(r"[^\w\s\.\,\(\)\-]"), ""),
245
+ (re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"),
246
  ]
247
 
248
  for pattern, repl in patterns:
249
  text = pattern.sub(repl, text)
250
 
251
+ # Deduplicate near-identical sentences using similarity threshold
252
  sentences = text.split(". ")
253
+ unique_sentences = []
254
  seen = set()
255
+
256
+ for s in sentences:
257
+ if not s:
258
+ continue
259
+ # Check similarity with existing sentences
260
+ is_unique = True
261
+ for seen_s in seen:
262
+ if SequenceMatcher(None, s.lower(), seen_s.lower()).ratio() > 0.9:
263
+ is_unique = False
264
+ break
265
+ if is_unique:
266
+ unique_sentences.append(s)
267
+ seen.add(s)
268
+
269
  text = ". ".join(unique_sentences).strip()
270
 
271
  return text if text else "No missed diagnoses identified."
272
 
273
  def summarize_findings(combined_response: str) -> str:
274
+ """Enhanced findings summarization for a single, concise paragraph"""
275
  if not combined_response:
276
  return "No missed diagnoses were identified in the provided records."
277
 
 
368
  def create_ui(agent):
369
  """Optimized UI creation with pre-compiled templates"""
370
  PROMPT_TEMPLATE = """
371
+ Analyze the patient record excerpt for missed diagnoses, focusing ONLY on clinical findings such as symptoms, medications, or evaluation results provided in the excerpt. Provide a concise, evidence-based summary in ONE paragraph without headings, bullet points, or repeating non-clinical data (e.g., name, date of birth, allergies). Include specific findings (e.g., 'elevated blood pressure (160/95)'), their implications (e.g., 'may indicate untreated hypertension'), and recommend urgent review. Treat medications or psychiatric evaluations as potential missed diagnoses. Do NOT use external tools, retrieve additional data, or summarize non-clinical information. If no clinical findings are present, state 'No missed diagnoses identified' in ONE sentence. Ignore other oversight categories (e.g., medication conflicts).
372
  Patient Record Excerpt (Chunk {0} of {1}):
373
  {chunk}
374
  """
 
406
  extracted.extend(cache[cache_key])
407
  else:
408
  result = process_file_cached(f.name, file_type)
409
+ cache[cache_key] = result
410
  extracted.extend(result)
411
 
412
  file_hash_value = file_hash(files[0].name) if files else ""
 
425
 
426
  combined_response = ""
427
  report_path = None
428
+ seen_responses = set() # Track unique responses to avoid repetition
429
 
430
  try:
431
  # Process in optimized batches
 
473
  content = clean_response(chunk_output)
474
 
475
  if content and content != "No missed diagnoses identified.":
476
+ # Check for near-duplicate responses
477
+ is_unique = True
478
+ for seen_response in seen_responses:
479
+ if SequenceMatcher(None, content.lower(), seen_response.lower()).ratio() > 0.9:
480
+ is_unique = False
481
+ break
482
+ if is_unique:
483
+ chunk_response += content + " "
484
+ seen_responses.add(content)
485
 
486
  if chunk_response:
487
  combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{chunk_response.strip()}\n"