Ali2206 committed · Commit 2ce0a4e · verified · 1 Parent(s): 7691fc2

Update app.py

Files changed (1):
  app.py (+128 -214)
app.py CHANGED
@@ -1,10 +1,11 @@
 import sys
 import os
 import pandas as pd
+import pdfplumber
 import json
 import gradio as gr
 from typing import List
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
 import re
@@ -15,21 +16,9 @@ import torch
 import gc
 from diskcache import Cache
 import time
-import asyncio
-
-# Try importing pypdfium2 and pytesseract, fall back to pdfplumber
-try:
-    import pypdfium2 as pdfium
-    import pytesseract
-    from PIL import Image
-    HAS_PYPDFIUM2 = True
-except ImportError:
-    HAS_PYPDFIUM2 = False
-    import pdfplumber
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("pdfminer").setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
 
 # Persistent directory
@@ -67,78 +56,37 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
 
-async def extract_all_pages_async(file_path: str, progress_callback=None, force_ocr=False) -> str:
+def extract_all_pages(file_path: str, progress_callback=None) -> str:
     try:
-        extracted_text = ""
-        total_pages = 0
-        text_chunks = []
-
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
-                logger.error("No pages found in PDF")
                 return ""
 
-            def extract_page(i):
-                page = pdf.pages[i]
-                text = ""
-                # Adjust table settings for complex layouts
-                table_settings = {
-                    "vertical_strategy": "lines",
-                    "horizontal_strategy": "lines",
-                    "explicit_vertical_lines": [],
-                    "explicit_horizontal_lines": [],
-                    "snap_tolerance": 5,
-                    "join_tolerance": 5,
-                    "edge_min_length": 3,
-                    "min_words_vertical": 3,
-                    "min_words_horizontal": 1,
-                    "intersection_tolerance": 5,
-                }
-                tables = page.extract_tables(table_settings=table_settings)
-                if tables:
-                    for table in tables:
-                        table_text = "\n".join(
-                            " | ".join(str(cell) if cell is not None else "" for cell in row)
-                            for row in table if any(cell is not None for cell in row)
-                        )
-                        text += table_text + "\n\n"
-                    logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
-                else:
-                    text = page.extract_text() or ""
-                    logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
-
-                # Force OCR if text is short or force_ocr is True
-                if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
-                    try:
-                        logger.info("Attempting OCR for page %d", i + 1)
-                        pdfium_pdf = pdfium.PdfDocument(file_path)
-                        page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
-                        ocr_text = pytesseract.image_to_string(page_bitmap, lang="eng")
-                        logger.debug("Page %d OCR text length: %d chars", i + 1, len(ocr_text))
-                        text = ocr_text if ocr_text.strip() else text
-                        pdfium_pdf.close()
-                    except Exception as e:
-                        logger.error("OCR failed for page %d: %s", i + 1, e)
-                return (i, f"=== Page {i + 1} ===\n{text.strip()}")
-
-            with ThreadPoolExecutor(max_workers=4) as executor:
-                futures = [executor.submit(extract_page, i) for i in range(total_pages)]
-                for future in futures:
-                    page_num, text = future.result()
-                    text_chunks.append((page_num, text))
-                    logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
-                    if progress_callback:
-                        progress_callback(page_num + 1, total_pages)
-
-            text_chunks.sort(key=lambda x: x[0])
-            extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
-            logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
-            # Force OCR retry if text is too short
-            if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
-                logger.info("Text too short, forcing OCR for all pages")
-                return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
-            return extracted_text
+            batch_size = 10
+            batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
+            text_chunks = [""] * total_pages
+            processed_pages = 0
+
+            def extract_batch(start: int, end: int) -> List[tuple]:
+                results = []
+                with pdfplumber.open(file_path) as pdf:
+                    for offset, page in enumerate(pdf.pages[start:end]):
+                        page_num = start + offset
+                        page_text = page.extract_text() or ""
+                        results.append((page_num, f"=== Page {page_num + 1} ===\n{page_text.strip()}"))
+                return results
+
+            with ThreadPoolExecutor(max_workers=6) as executor:
+                futures = [executor.submit(extract_batch, start, end) for start, end in batches]
+                for future in as_completed(futures):
+                    for page_num, text in future.result():
+                        text_chunks[page_num] = text
+                    processed_pages += batch_size
+                    if progress_callback:
+                        progress_callback(min(processed_pages, total_pages), total_pages)
+
+            return "\n\n".join(filter(None, text_chunks))
     except Exception as e:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
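Note: the rewritten `extract_all_pages` drops the OCR and table-extraction paths in favor of plain text extraction over 10-page batches, and each worker opens its own `pdfplumber` handle instead of sharing the outer one, since pdfplumber objects are not safe to use from multiple threads. A minimal sketch of the same pattern, assuming a local `sample.pdf`:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

import pdfplumber

def extract_range(path: str, start: int, end: int) -> list:
    # Each worker opens its own handle; sharing one PDF object across threads is unsafe.
    with pdfplumber.open(path) as pdf:
        return [(i, pdf.pages[i].extract_text() or "") for i in range(start, end)]

def extract_parallel(path: str, batch: int = 10, workers: int = 6) -> str:
    with pdfplumber.open(path) as pdf:
        total = len(pdf.pages)
    pages = [""] * total
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(extract_range, path, s, min(s + batch, total))
                   for s in range(0, total, batch)]
        for future in as_completed(futures):
            for i, text in future.result():
                pages[i] = text  # index by absolute page number, so completion order is irrelevant
    return "\n\n".join(filter(None, pages))

print(extract_parallel("sample.pdf")[:200])
```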
@@ -147,59 +95,28 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
     try:
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
-        # Bypass cache to force fresh extraction
-        logger.info("Forcing fresh extraction for %s", file_path)
-        # if cache_key in cache:
-        #     logger.info("Using cached extraction for %s", file_path)
-        #     return cache[cache_key]
+        if cache_key in cache:
+            return cache[cache_key]
 
         if file_type == "pdf":
-            text = asyncio.run(extract_all_pages_async(file_path, progress_callback, force_ocr=False))
+            text = extract_all_pages(file_path, progress_callback)
             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
         elif file_type == "csv":
-            try:
-                df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
-                                 skip_blank_lines=False, on_bad_lines="skip")
-                content = df.fillna("").astype(str).values.tolist()
-                result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-                logger.info("CSV processed, rows: %d", len(content))
-            except Exception as e:
-                logger.error("CSV processing failed: %s", e)
-                result = json.dumps({"error": f"CSV processing failed: {str(e)}"})
+            df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
+                             skip_blank_lines=False, on_bad_lines="skip")
+            content = df.fillna("").astype(str).values.tolist()
+            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         elif file_type in ["xls", "xlsx"]:
             try:
-                # Try all sheets to maximize data
-                xl = pd.ExcelFile(file_path, engine="openpyxl")
-                content = []
-                for sheet_name in xl.sheet_names:
-                    try:
-                        df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
-                        sheet_content = df.fillna("").astype(str).values.tolist()
-                        content.extend(sheet_content)
-                        logger.debug("Excel sheet %s processed, rows: %d", sheet_name, len(sheet_content))
-                    except Exception as e:
-                        logger.warning("Excel sheet %s failed: %s", sheet_name, e)
-                if not content:
-                    logger.error("No valid data extracted from Excel")
-                    result = json.dumps({"error": "No valid data extracted from Excel"})
-                else:
-                    result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-                    logger.info("Excel processed, total rows: %d", len(content))
-            except Exception as e:
-                logger.error("Excel processing failed: %s", e)
-                try:
-                    df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
-                    content = df.fillna("").astype(str).values.tolist()
-                    result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-                    logger.info("Excel processed with xlrd, rows: %d", len(content))
-                except Exception as e2:
-                    logger.error("Excel processing failed with xlrd: %s", e2)
-                    result = json.dumps({"error": f"Excel processing failed: {str(e)}"})
+                df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
+            except Exception:
+                df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
+            content = df.fillna("").astype(str).values.tolist()
+            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         else:
            result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
-        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
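Note: this hunk also reverts the earlier cache bypass, so extraction results are again memoized in `diskcache` under a key built from the file's MD5 digest plus its type; re-uploading identical bytes returns the stored JSON without re-parsing. A sketch of the pattern, with `expensive_extract` as a hypothetical stand-in for the real converters:

```python
import hashlib
from diskcache import Cache

cache = Cache("cache_dir")  # assumed cache location

def file_hash(path: str) -> str:
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()

def convert_cached(path: str, file_type: str) -> str:
    key = f"{file_hash(path)}_{file_type}"  # same bytes + same type -> same key
    if key in cache:
        return cache[key]                        # hit: skip extraction entirely
    result = expensive_extract(path, file_type)  # hypothetical extractor
    cache[key] = result                          # persisted across process restarts
    return result
```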
@@ -222,63 +139,66 @@ def log_system_usage(tag=""):
 
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
-    text = text.replace("[", "").replace("]", "").replace("None", "")
-    text = text.replace("\n\n\n", "\n\n")
+    text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)
+
     sections = {}
     current_section = None
-    seen_lines = set()
-    for line in text.splitlines():
+    lines = text.splitlines()
+    for line in lines:
         line = line.strip()
-        if not line or line in seen_lines:
+        if not line:
             continue
-        seen_lines.add(line)
         section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
-            sections.setdefault(current_section, [])
+            if current_section not in sections:
+                sections[current_section] = []
             continue
-        if current_section and line.startswith("- "):
+        finding_match = re.match(r"-\s*.+", line)
+        if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
            sections[current_section].append(line)
-    cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
-    result = "\n\n".join(cleaned).strip()
-    logger.debug("Cleaned response length: %d chars", len(result))
-    return result or "No oversights identified"
-
-def summarize_findings(all_responses: List[str]) -> str:
-    combined_response = "\n\n".join(all_responses)
-    if not combined_response or all("No oversights identified" in resp.lower() for resp in all_responses):
-        return "### Comprehensive Clinical Oversight Summary\nNo critical oversights were identified across the provided patient records after thorough analysis."
-
-    sections = {
-        "Missed Diagnoses": [],
-        "Medication Conflicts": [],
-        "Incomplete Assessments": [],
-        "Urgent Follow-up": []
-    }
+
+    cleaned = []
+    for heading, findings in sections.items():
+        if findings:
+            cleaned.append(f"### {heading}\n" + "\n".join(findings))
+
+    text = "\n\n".join(cleaned).strip()
+    return text if text else ""
+
+def summarize_findings(combined_response: str) -> str:
+    if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
+        return "### Summary of Clinical Oversights\nNo critical oversights identified in the provided records."
+
+    sections = {}
+    lines = combined_response.splitlines()
     current_section = None
-    seen_findings = set()
-    for line in combined_response.splitlines():
+    for line in lines:
         line = line.strip()
         if not line:
             continue
         section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
+            if current_section not in sections:
+                sections[current_section] = []
             continue
-        if current_section and line.startswith("- ") and line not in seen_findings:
-            sections[current_section].append(line)
-            seen_findings.add(line)
+        finding_match = re.match(r"-\s*(.+)", line)
+        if finding_match and current_section:
+            sections[current_section].append(finding_match.group(1))
 
     summary_lines = []
     for heading, findings in sections.items():
         if findings:
-            summary_lines.append(f"### {heading}")
-            for finding in findings:
-                summary_lines.append(f"{finding}\n - **Risks**: Potential adverse outcomes if not addressed.\n - **Recommendation**: Immediate clinical review and follow-up.")
+            summary = f"- **{heading}**: {'; '.join(findings[:2])}. Risks: {heading.lower()} may lead to adverse outcomes. Recommend: urgent review and specialist referral."
+            summary_lines.append(summary)
+
+    if not summary_lines:
+        return "### Summary of Clinical Oversights\nNo critical oversights identified."
 
-    result = "### Comprehensive Clinical Oversight Summary\n" + "\n".join(summary_lines) if summary_lines else "### Comprehensive Clinical Oversight Summary\nNo critical oversights identified."
-    logger.debug("Summary length: %d chars", len(result))
-    return result
+    return "### Summary of Clinical Oversights\n" + "\n".join(summary_lines)
 
 def init_agent():
     logger.info("Initializing model...")
@@ -294,9 +214,7 @@ def init_agent():
         tool_files_dict={"new_tool": target_tool_path},
         force_finish=True,
         enable_checker=False,
-        enable_rag=False,
-        init_rag_num=0,
-        step_rag_num=0,
+        step_rag_num=4,
         seed=100,
         additional_default_tools=[],
     )
@@ -308,8 +226,8 @@ def init_agent():
 def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
-        chatbot = gr.Chatbot(label="Detailed Analysis", height=600, type="messages", visible=False)
-        final_summary = gr.Markdown(label="Comprehensive Clinical Oversight Summary")
+        chatbot = gr.Chatbot(label="Detailed Analysis", height=600, type="messages")
+        final_summary = gr.Markdown(label="Summary of Clinical Oversights")
         file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
         msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
         send_btn = gr.Button("Analyze", variant="primary")
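Note: with `visible=False` removed and `type="messages"` kept, the chatbot renders OpenAI-style role/content dicts, which is exactly the shape `analyze` appends throughout:

```python
history = []
history.append({"role": "user", "content": "Check this record for oversights."})
history.append({"role": "assistant", "content": "✅ Text extraction complete."})
```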
@@ -317,14 +235,12 @@ def create_ui(agent):
         progress_bar = gr.Progress()
 
         prompt_template = """
-Analyze the patient record excerpt for clinical oversights. Provide a detailed, evidence-based summary in markdown with findings grouped under headings: Missed Diagnoses, Medication Conflicts, Incomplete Assessments, Urgent Follow-up. For each finding, include clinical context, risks, and recommendations. Output only markdown bullet points under headings. If no issues, state "No oversights identified" once.
-
-Patient Record Excerpt:
+Analyze the patient record excerpt for clinical oversights. Provide a concise, evidence-based summary in markdown with findings grouped under headings (e.g., 'Missed Diagnoses'). For each finding, include clinical context, risks, and recommendations. Output only markdown bullet points under headings. If no issues, state "No issues identified".
+Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
 
-        async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
-            start_time = time.time()
+        def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
             history.append({"role": "user", "content": message})
             yield history, None, ""
 
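Note: the new template mixes positional fields with the named `{chunk}` field; `str.format` accepts both in a single call, which is how `analyze` injects the chunk index, total count, and excerpt:

```python
template = "Patient Record Excerpt (Chunk {0} of {1}):\n{chunk}"
print(template.format(3, 12, chunk="BP 180/110, no follow-up documented"))
# Patient Record Excerpt (Chunk 3 of 12):
# BP 180/110, no follow-up documented
```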
@@ -335,75 +251,73 @@ Patient Record Excerpt:
                 progress(current / total, desc=f"Extracting text... Page {current}/{total}")
                 return history, None, ""
 
-            futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
-            results = [sanitize_utf8(future) for future in futures]
-            extracted = "\n".join([json.loads(r).get("content", "") for r in results if "content" in json.loads(r)])
-            file_hash_value = file_hash(files[0].name) if files else ""
+            with ThreadPoolExecutor(max_workers=6) as executor:
+                futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
+                results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
+            extracted = "\n".join(results)
+            file_hash_value = file_hash(files[0].name) if files else ""
 
             history.append({"role": "assistant", "content": "✅ Text extraction complete."})
             yield history, None, ""
-            logger.info("Extracted text length: %d chars", len(extracted))
-
-            chunk_size = 3000
-            chunks = [extracted[i:i + chunk_size] for i in range(0, max(len(extracted), 1), chunk_size)] or [""]
-            logger.info("Created %d chunks", len(chunks))
-            for i, chunk in enumerate(chunks):
-                logger.debug("Chunk %d content: %s...", i + 1, chunk[:100])
-            all_responses = []
+
+            chunk_size = 6000
+            chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+            combined_response = ""
             batch_size = 2
 
             try:
                 for batch_idx in range(0, len(chunks), batch_size):
                     batch_chunks = chunks[batch_idx:batch_idx + batch_size]
-                    batch_prompts = [prompt_template.format(chunk=chunk[:2000]) for chunk in batch_chunks]
+                    batch_prompts = [prompt_template.format(batch_idx + i + 1, len(chunks), chunk=chunk[:4000]) for i, chunk in enumerate(batch_chunks)]
                     batch_responses = []
 
                     progress((batch_idx + 1) / len(chunks), desc=f"Analyzing chunks {batch_idx + 1}-{min(batch_idx + batch_size, len(chunks))}/{len(chunks)}")
 
-                    async def process_chunk(prompt):
-                        chunk_response = ""
-                        raw_outputs = []
-                        for chunk_output in agent.run_gradio_chat(
-                            message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=2048, call_agent=False, conversation=[]
-                        ):
-                            if chunk_output is None:
-                                continue
-                            if isinstance(chunk_output, list):
-                                for m in chunk_output:
-                                    if hasattr(m, 'content') and m.content:
-                                        raw_outputs.append(m.content)
-                                        cleaned = clean_response(m.content)
+                    with ThreadPoolExecutor(max_workers=len(batch_chunks)) as executor:
+                        futures = [executor.submit(agent.run_gradio_chat, prompt, [], 0.2, 512, 2048, False, []) for prompt in batch_prompts]
+                        for future in as_completed(futures):
+                            chunk_response = ""
+                            for chunk_output in future.result():
+                                if chunk_output is None:
+                                    continue
+                                if isinstance(chunk_output, list):
+                                    for m in chunk_output:
+                                        if hasattr(m, 'content') and m.content:
+                                            cleaned = clean_response(m.content)
+                                            if cleaned and re.search(r"###\s*\w+", cleaned):
+                                                chunk_response += cleaned + "\n\n"
+                                elif isinstance(chunk_output, str) and chunk_output.strip():
+                                    cleaned = clean_response(chunk_output)
+                                    if cleaned and re.search(r"###\s*\w+", cleaned):
                                         chunk_response += cleaned + "\n\n"
-                            elif isinstance(chunk_output, str) and chunk_output.strip():
-                                raw_outputs.append(chunk_output)
-                                cleaned = clean_response(chunk_output)
-                                chunk_response += cleaned + "\n\n"
-                        logger.debug("Raw outputs for chunk: %s", raw_outputs[:100])
-                        logger.debug("Chunk response length: %d chars", len(chunk_response))
-                        return chunk_response
-
-                    futures = [process_chunk(prompt) for prompt in batch_prompts]
-                    batch_responses = await asyncio.gather(*futures)
-                    all_responses.extend([resp.strip() for resp in batch_responses if resp.strip()])
-                    torch.cuda.empty_cache()
-                    gc.collect()
-
-                summary = summarize_findings(all_responses)
-                history.append({"role": "assistant", "content": "Analysis complete. See summary below."})
-                yield history, None, summary
+                            batch_responses.append(chunk_response)
+                            torch.cuda.empty_cache()
+                            gc.collect()
+
+                    for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
+                        if chunk_response:
+                            combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
+                        else:
+                            combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
+                    history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                    yield history, None, ""
+
+                if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
+                    history[-1]["content"] = combined_response.strip()
+                else:
+                    history.append({"role": "assistant", "content": "No oversights identified in the provided records."})
 
+                summary = summarize_findings(combined_response)
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
                 if report_path:
                     with open(report_path, "w", encoding="utf-8") as f:
-                        f.write(summary)
+                        f.write(combined_response + "\n\n" + summary)
                 yield history, report_path if report_path and os.path.exists(report_path) else None, summary
-                logger.info("Analysis took %.2f seconds", time.time() - start_time)
 
             except Exception as e:
                 logger.error("Analysis error: %s", e)
                 history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
-                yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
-                logger.info("Analysis took %.2f seconds", time.time() - start_time)
+                yield history, None, f"### Summary of Clinical Oversights\nError occurred during analysis: {str(e)}"
 
         send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
         msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
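Note: `analyze` stays a generator even though it is no longer `async`: each `yield history, file, summary` pushes an intermediate update to the three bound outputs, and the last yield delivers the report path and final summary. A stripped-down sketch of that wiring, with `run_analysis` as a hypothetical stand-in for the chunked agent loop:

```python
import gradio as gr

def run_analysis(message: str) -> str:
    # hypothetical stand-in for the chunked agent analysis
    return "### Summary of Clinical Oversights\n- Example finding."

def analyze(message, history):
    history = history + [{"role": "user", "content": message}]
    yield history, ""  # immediate update: echo the user turn
    history = history + [{"role": "assistant", "content": "Analysis complete."}]
    yield history, run_analysis(message)  # final update fills the Markdown pane

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    final_summary = gr.Markdown()
    msg_input = gr.Textbox()
    msg_input.submit(analyze, inputs=[msg_input, chatbot], outputs=[chatbot, final_summary])
```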
 