Ali2206 committed (verified)
Commit dda4a06 · 1 Parent(s): 5601da1

Update app.py

Files changed (1)
  1. app.py +93 -249
app.py CHANGED
@@ -27,11 +27,11 @@ logger = logging.getLogger(__name__)
 
 # Constants
 MAX_TOKENS = 1800
-BATCH_SIZE = 1  # Reduced to minimize memory pressure
+BATCH_SIZE = 1
 MAX_WORKERS = 2
-CHUNK_SIZE = 5  # Smaller chunks for PDF processing
+CHUNK_SIZE = 5
 MODEL_MAX_TOKENS = 131072
-MAX_TEXT_LENGTH = 500000  # Limit raw text length before tokenization
+MAX_TEXT_LENGTH = 500000
 
 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
@@ -63,17 +63,14 @@ from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
 cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
-# Initialize tokenizer for precise chunking (with caching)
 @lru_cache(maxsize=1)
 def get_tokenizer():
     return AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
 
 def sanitize_utf8(text: str) -> str:
-    """Optimized UTF-8 sanitization"""
     return text.encode("utf-8", "ignore").decode("utf-8")
 
 def file_hash(path: str) -> str:
-    """Optimized file hashing with buffer reading"""
     hash_md5 = hashlib.md5()
     with open(path, "rb") as f:
         for chunk in iter(lambda: f.read(4096), b""):
@@ -81,12 +78,10 @@ def file_hash(path: str) -> str:
     return hash_md5.hexdigest()
 
 def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
-    """Extract and chunk a single page with token limit"""
     try:
         text = page.extract_text() or ""
         text = sanitize_utf8(text)
-        if len(text) > MAX_TEXT_LENGTH // 10:  # Per-page text limit
-            logger.warning(f"Page {page.page_number} text too long ({len(text)}). Truncating.")
+        if len(text) > MAX_TEXT_LENGTH // 10:
             text = text[:MAX_TEXT_LENGTH // 10]
 
         tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -111,7 +106,6 @@ def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
         return []
 
 def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
-    """Extract PDF pages with early token-based chunking"""
     try:
         tokenizer = get_tokenizer()
         with pdfplumber.open(file_path) as pdf:
@@ -134,7 +128,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             for chunk in page_chunks:
                 chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
                 if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
-                    logger.warning(f"Total tokens ({total_tokens + chunk_tokens}) exceed model limit ({MODEL_MAX_TOKENS}). Stopping.")
+                    logger.warning(f"Total tokens exceed model limit. Stopping.")
                     return results
                 results.append(chunk)
                 total_tokens += chunk_tokens
@@ -151,31 +145,56 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         return [f"PDF processing error: {str(e)}"]
 
 def excel_to_json(file_path: str) -> List[Dict]:
-    """Optimized Excel processing with chunking"""
     try:
-        for engine in ['openpyxl', 'xlrd']:
+        # Try with openpyxl first
+        try:
+            with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
+                sheets = excel_file.sheet_names
+                results = []
+                for sheet_name in sheets:
+                    df = pd.read_excel(
+                        excel_file,
+                        sheet_name=sheet_name,
+                        header=None,
+                        dtype=str,
+                        na_filter=False
+                    )
+                    if not df.empty:
+                        results.append({
+                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
+                            "rows": df.values.tolist(),
+                            "type": "excel"
+                        })
+                return results if results else [{"error": "No data found in any sheet"}]
+        except Exception as openpyxl_error:
+            # Fallback to xlrd
             try:
-                df = pd.read_excel(
-                    file_path,
-                    engine=engine,
-                    header=None,
-                    dtype=str,
-                    na_filter=False
-                )
-                return [{
-                    "filename": os.path.basename(file_path),
-                    "rows": df.values.tolist(),
-                    "type": "excel"
-                }]
-            except Exception:
-                continue
-        raise Exception("No suitable Excel engine found")
+                with pd.ExcelFile(file_path, engine='xlrd') as excel_file:
+                    sheets = excel_file.sheet_names
+                    results = []
+                    for sheet_name in sheets:
+                        df = pd.read_excel(
+                            excel_file,
+                            sheet_name=sheet_name,
+                            header=None,
+                            dtype=str,
+                            na_filter=False
+                        )
+                        if not df.empty:
+                            results.append({
+                                "filename": f"{os.path.basename(file_path)} - {sheet_name}",
+                                "rows": df.values.tolist(),
+                                "type": "excel"
+                            })
+                    return results if results else [{"error": "No data found in any sheet"}]
+            except Exception as xlrd_error:
+                logger.error(f"Excel processing failed: {xlrd_error}")
+                return [{"error": f"Excel processing failed: {str(xlrd_error)}"}]
     except Exception as e:
-        logger.error(f"Excel processing error: {e}")
-        return [{"error": f"Excel processing error: {str(e)}"}]
+        logger.error(f"Excel file opening error: {e}")
+        return [{"error": f"Excel file opening error: {str(e)}"}]
 
 def csv_to_json(file_path: str) -> List[Dict]:
-    """Optimized CSV processing with chunking"""
     try:
         chunks = []
         for chunk in pd.read_csv(
@@ -201,7 +220,6 @@ def csv_to_json(file_path: str) -> List[Dict]:
 
 @lru_cache(maxsize=100)
 def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
-    """Cached file processing with memory optimization"""
     try:
         if file_type == "pdf":
             chunks = extract_all_pages(file_path)
@@ -218,80 +236,17 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
         else:
             return [{"error": f"Unsupported file type: {file_type}"}]
     except Exception as e:
-        logger.error(f"Error processing {os.path.basename(file_path)}: {e}")
-        return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
-
-def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
-    """Optimized tokenization and chunking with early validation"""
-    if len(text) > MAX_TEXT_LENGTH:
-        logger.warning(f"Text length ({len(text)}) exceeds limit ({MAX_TEXT_LENGTH}). Truncating.")
-        text = text[:MAX_TEXT_LENGTH]
-
-    tokenizer = get_tokenizer()
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    if len(tokens) > MODEL_MAX_TOKENS:
-        logger.error(f"Token count ({len(tokens)}) exceeds model limit ({MODEL_MAX_TOKENS}).")
-        return [text[:MAX_TEXT_LENGTH // 10]]  # Fallback to small chunk
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for token in tokens:
-        if current_length + 1 > max_tokens:
-            chunks.append(tokenizer.decode(current_chunk))
-            current_chunk = [token]
-            current_length = 1
-        else:
-            current_chunk.append(token)
-            current_length += 1
-
-    if current_chunk:
-        chunks.append(tokenizer.decode(current_chunk))
-
-    return chunks
-
-def log_system_usage(tag=""):
-    """Optimized system monitoring"""
-    try:
-        cpu = psutil.cpu_percent(interval=0.5)
-        mem = psutil.virtual_memory()
-        logger.info(f"[{tag}] CPU: {cpu:.1f}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
-
-        try:
-            result = subprocess.run(
-                ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
-                capture_output=True,
-                text=True,
-                timeout=2
-            )
-            if result.returncode == 0:
-                used, total, util = result.stdout.strip().split(", ")
-                logger.info(f"[{tag}] GPU: {used}MB / {total}MB | Utilization: {util}%")
-        except subprocess.TimeoutExpired:
-            logger.warning(f"[{tag}] GPU monitoring timed out")
-    except Exception as e:
-        logger.error(f"[{tag}] Monitor failed: {e}")
+        logger.error(f"Error processing file: {e}")
+        return [{"error": f"Error processing file: {str(e)}"}]
 
 def clean_response(text: str) -> str:
-    """Enhanced response cleaning with aggressive deduplication"""
     if not text:
         return ""
 
     patterns = [
         (re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
-        (re.compile(r"(The patient record excerpt provides|Patient record excerpt contains).*?(John Doe|general information).*?\.", re.IGNORECASE), ""),
-        (re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
-        (re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
-        (re.compile(r"(Therefore, )?(Retrieving|I will start by retrieving) tools.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"Final Analysis\s*", re.IGNORECASE), ""),
-        (re.compile(r"Therefore, no missed diagnoses can be identified.*?\.", re.IGNORECASE), ""),
         (re.compile(r"\s+"), " "),
         (re.compile(r"[^\w\s\.\,\(\)\-]"), ""),
-        (re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"),
     ]
 
     for pattern, repl in patterns:
@@ -314,74 +269,11 @@ def clean_response(text: str) -> str:
             seen.add(s)
 
     text = ". ".join(unique_sentences).strip()
-
     return text if text else "No missed diagnoses identified."
 
-def summarize_findings(combined_response: str) -> str:
-    """Enhanced findings summarization for a single, concise paragraph"""
-    if not combined_response:
-        return "No missed diagnoses were identified in the provided records."
-
-    diagnosis_pattern = re.compile(r"-\s*(.+)$")
-    section_pattern = re.compile(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)")
-    no_issues_pattern = re.compile(r"No issues identified|No missed diagnoses identified", re.IGNORECASE)
-
-    diagnoses = []
-    current_section = None
-
-    for line in combined_response.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-
-        section_match = section_pattern.match(line)
-        if section_match:
-            current_section = "diagnoses" if section_match.group(1) == "Missed Diagnoses" else None
-            continue
-
-        if current_section == "diagnoses":
-            diagnosis_match = diagnosis_pattern.match(line)
-            if diagnosis_match and not no_issues_pattern.search(line):
-                diagnosis = diagnosis_match.group(1).strip()
-                if diagnosis:
-                    diagnoses.append(diagnosis)
-
-    medication_pattern = re.compile(r"medications includ(?:e|ing|ed) ([^\.]+)", re.IGNORECASE)
-    evaluation_pattern = re.compile(r"psychiatric evaluation.*?mention of ([^\.]+)", re.IGNORECASE)
-
-    for line in combined_response.splitlines():
-        line = line.strip()
-        if not line or no_issues_pattern.search(line):
-            continue
-
-        med_match = medication_pattern.search(line)
-        if med_match:
-            meds = med_match.group(1).strip()
-            diagnoses.append(f"use of medications ({meds}), suggesting an undiagnosed psychiatric condition requiring urgent review")
-
-        eval_match = evaluation_pattern.search(line)
-        if eval_match:
-            details = eval_match.group(1).strip()
-            diagnoses.append(f"psychiatric evaluation noting {details}, indicating a potential missed psychiatric diagnosis requiring urgent review")
-
-    if not diagnoses:
-        return "No missed diagnoses were identified in the provided records."
-
-    seen = set()
-    unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
-
-    summary = "The patient record indicates missed diagnoses including "
-    summary += ", ".join(unique_diagnoses[:-1])
-    summary += f", and {unique_diagnoses[-1]}" if len(unique_diagnoses) > 1 else unique_diagnoses[0]
-    summary += ". These findings suggest potential oversights in the patient's medical evaluation and require urgent clinical review to prevent adverse outcomes."
-
-    return summary
-
 @lru_cache(maxsize=1)
 def init_agent():
-    """Cached agent initialization with memory optimization"""
     logger.info("Initializing model...")
-    log_system_usage("Before Load")
 
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
@@ -399,15 +291,12 @@ def init_agent():
         additional_default_tools=[],
     )
     agent.init_model()
-
-    log_system_usage("After Load")
     logger.info("Agent Ready")
     return agent
 
 def create_ui(agent):
-    """Optimized UI creation with pre-compiled templates"""
     PROMPT_TEMPLATE = """
-Analyze the patient record excerpt for missed diagnoses, focusing ONLY on clinical findings such as symptoms, medications, or evaluation results provided in the excerpt. Provide a detailed, evidence-based analysis using all available tools (e.g., Tool_RAG, CallAgent) to identify potential oversights. Include specific findings (e.g., 'elevated blood pressure (160/95)'), their implications (e.g., 'may indicate untreated hypertension'), and recommend urgent review. Treat medications or psychiatric evaluations as potential missed diagnoses. Do NOT repeat non-clinical information (e.g., name, date of birth, allergies). If no clinical findings are present, state 'No missed diagnoses identified' in ONE sentence. Ignore other oversight categories (e.g., medication conflicts).
+Analyze the patient record excerpt for missed diagnoses. Provide detailed, evidence-based analysis.
 Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
@@ -417,8 +306,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
     with gr.Row():
         with gr.Column(scale=3):
-            chatbot = gr.Chatbot(label="Analysis Summary", height=600, type="messages")
-            msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
+            chatbot = gr.Chatbot(label="Analysis Summary", height=600)
+            msg_input = gr.Textbox(placeholder="Ask about potential oversights...")
             send_btn = gr.Button("Analyze", variant="primary")
             file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
 
@@ -428,7 +317,6 @@ Patient Record Excerpt (Chunk {0} of {1}):
         progress_bar = gr.Progress()
 
        def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
-            """Optimized analysis pipeline with quick summary and background report"""
            history.append({"role": "user", "content": message})
            yield history, None, ""

@@ -442,31 +330,38 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
                 if cache_key in cache:
                     extracted.extend(cache[cache_key])
+                    history.append({"role": "assistant", "content": f"Using cached data for {os.path.basename(f.name)}"})
+                    yield history, None, ""
                 else:
                     result = process_file_cached(f.name, file_type)
-                    cache[cache_key] = result
-                    extracted.extend(result)
+                    if result and not (len(result) == 1 and "error" in result[0]):
+                        cache[cache_key] = result
+                        extracted.extend(result)
+                        history.append({"role": "assistant", "content": f"Processed {os.path.basename(f.name)}"})
+                        yield history, None, ""
+                    else:
+                        error_msg = result[0]["error"] if result else "Unknown error"
+                        history.append({"role": "assistant", "content": f"Failed to process {os.path.basename(f.name)}: {error_msg}"})
+                        yield history, None, error_msg
+                        return
 
             file_hash_value = file_hash(files[0].name) if files else ""
-            history.append({"role": "assistant", "content": "✅ File processing complete"})
-            yield history, None, ""
 
             if not extracted:
-                history.append({"role": "assistant", "content": "❌ No valid content extracted. Please upload a supported file."})
-                yield history, None, "No valid content extracted."
+                history.append({"role": "assistant", "content": "❌ No valid content extracted"})
+                yield history, None, "No valid content extracted"
+                return
+
+            chunks = [item["content"] for item in extracted if "content" in item]
+            if not chunks:
+                history.append({"role": "assistant", "content": "❌ No processable content found"})
+                yield history, None, "No processable content found"
                 return
 
             combined_response = ""
             report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-            seen_responses = set()
 
             try:
-                chunks = [item["content"] for item in extracted if "content" in item]
-                if not chunks:
-                    history.append({"role": "assistant", "content": "❌ No processable content found in the file."})
-                    yield history, None, "No processable content found."
-                    return
-
                 for batch_idx in range(0, len(chunks), BATCH_SIZE):
                     batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
 
@@ -483,7 +378,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                              desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")
 
                     with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
-                        quick_futures = {
+                        futures = {
                             executor.submit(
                                 agent.run_quick_summary,
                                 chunk, 0.2, 256, 1024
@@ -491,74 +386,31 @@ Patient Record Excerpt (Chunk {0} of {1}):
                             for idx, chunk in enumerate(batch_chunks)
                         }
 
-                        for future in as_completed(quick_futures):
-                            chunk_idx = quick_futures[future]
-                            try:
-                                quick_response = clean_response(future.result())
-                                if quick_response and quick_response != "No missed diagnoses identified.":
-                                    is_unique = True
-                                    for seen_response in seen_responses:
-                                        if SequenceMatcher(None, quick_response.lower(), seen_response.lower()).ratio() > 0.9:
-                                            is_unique = False
-                                            break
-                                    if is_unique:
-                                        combined_response += f"--- Quick Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{quick_response}\n"
-                                        seen_responses.add(quick_response)
-                                        history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                                        yield history, None, ""
-                            except Exception as e:
-                                logger.error(f"Quick summary error for chunk {batch_idx + chunk_idx + 1}: {e}")
-                                history[-1] = {"role": "assistant", "content": f"Error processing chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
-                                yield history, None, ""
-                            finally:
-                                del future
-                                torch.cuda.empty_cache()
-                                gc.collect()
-
-                    # Start background detailed analysis
-                    with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
-                        detailed_futures = {
-                            executor.submit(
-                                agent.run_gradio_chat,
-                                prompt, [], 0.2, 512, 2048, False, None, 3, None, 0, None, report_path
-                            ): idx
-                            for idx, prompt in enumerate(batch_prompts)
-                        }
-
-                        for future in as_completed(detailed_futures):
-                            chunk_idx = detailed_futures[future]
+                        for future in as_completed(futures):
+                            chunk_idx = futures[future]
                             try:
-                                for chunk_output in future.result():
-                                    if isinstance(chunk_output, list):
-                                        for msg in chunk_output:
-                                            if isinstance(msg, gr.ChatMessage) and msg.content:
-                                                cleaned_content = clean_response(msg.content)
-                                                if cleaned_content and cleaned_content != "No missed diagnoses identified.":
-                                                    combined_response += cleaned_content + "\n"
-                                                    history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                                                    yield history, report_path, ""
+                                response = clean_response(future.result())
+                                if response:
+                                    combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
+                                    history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                                    yield history, None, ""
                             except Exception as e:
-                                logger.error(f"Detailed analysis error for chunk {batch_idx + chunk_idx + 1}: {e}")
-                                history[-1] = {"role": "assistant", "content": f"Error in detailed analysis for chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
+                                logger.error(f"Chunk processing error: {e}")
+                                history[-1] = {"role": "assistant", "content": f"Error processing chunk: {str(e)}"}
                                 yield history, None, ""
                             finally:
                                 del future
                                 torch.cuda.empty_cache()
                                 gc.collect()
 
-                summary = summarize_findings(combined_response)
-
-                if report_path and os.path.exists(report_path):
-                    history.append({"role": "assistant", "content": "Detailed report ready for download."})
-                    yield history, report_path, summary
-                else:
-                    history.append({"role": "assistant", "content": "Detailed report still processing."})
-                    yield history, None, summary
+                summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
+                history.append({"role": "assistant", "content": "Analysis completed successfully"})
+                yield history, report_path, summary
 
             except Exception as e:
                 logger.error(f"Analysis error: {e}")
                 history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
-                yield history, None, f"Error occurred during analysis: {str(e)}"
+                yield history, None, f"Error occurred: {str(e)}"
             finally:
                 torch.cuda.empty_cache()
                 gc.collect()
@@ -578,22 +430,14 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
 if __name__ == "__main__":
     try:
-        logger.info("Launching optimized app...")
+        logger.info("Launching app...")
        agent = init_agent()
        demo = create_ui(agent)
-        demo.queue(
-            api_open=False,
-            max_size=20
-        ).launch(
+        demo.queue().launch(
            server_name="0.0.0.0",
            server_port=7860,
-            show_error=True,
-            allowed_paths=[report_dir],
-            share=False
+            show_error=True
        )
    except Exception as e:
        logger.error(f"Fatal error: {e}")
-        raise
-    finally:
-        if torch.distributed.is_initialized():
-            torch.distributed.destroy_process_group()
+        raise