Update app.py
app.py (CHANGED)
@@ -27,11 +27,11 @@ logger = logging.getLogger(__name__)
 
 # Constants
 MAX_TOKENS = 1800
-BATCH_SIZE = 1
+BATCH_SIZE = 1
 MAX_WORKERS = 2
-CHUNK_SIZE = 5
+CHUNK_SIZE = 5
 MODEL_MAX_TOKENS = 131072
-MAX_TEXT_LENGTH = 500000
+MAX_TEXT_LENGTH = 500000
 
 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
@@ -63,17 +63,14 @@ from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
 cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
-# Initialize tokenizer for precise chunking (with caching)
 @lru_cache(maxsize=1)
 def get_tokenizer():
     return AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
 
 def sanitize_utf8(text: str) -> str:
-    """Optimized UTF-8 sanitization"""
     return text.encode("utf-8", "ignore").decode("utf-8")
 
 def file_hash(path: str) -> str:
-    """Optimized file hashing with buffer reading"""
     hash_md5 = hashlib.md5()
     with open(path, "rb") as f:
         for chunk in iter(lambda: f.read(4096), b""):
@@ -81,12 +78,10 @@ def file_hash(path: str) -> str:
     return hash_md5.hexdigest()
 
 def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
-    """Extract and chunk a single page with token limit"""
     try:
         text = page.extract_text() or ""
         text = sanitize_utf8(text)
-        if len(text) > MAX_TEXT_LENGTH // 10:
-            logger.warning(f"Page {page.page_number} text too long ({len(text)}). Truncating.")
+        if len(text) > MAX_TEXT_LENGTH // 10:
             text = text[:MAX_TEXT_LENGTH // 10]
 
         tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -111,7 +106,6 @@ def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
         return []
 
 def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
-    """Extract PDF pages with early token-based chunking"""
     try:
         tokenizer = get_tokenizer()
         with pdfplumber.open(file_path) as pdf:
@@ -134,7 +128,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             for chunk in page_chunks:
                 chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
                 if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
-                    logger.warning(f"Total tokens …
+                    logger.warning(f"Total tokens exceed model limit. Stopping.")
                     return results
                 results.append(chunk)
                 total_tokens += chunk_tokens
@@ -151,31 +145,56 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         return [f"PDF processing error: {str(e)}"]
 
 def excel_to_json(file_path: str) -> List[Dict]:
-    """Optimized Excel processing with chunking"""
     try:
-        […1 removed line not legible in the source…]
+        # Try with openpyxl first
+        try:
+            with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
+                sheets = excel_file.sheet_names
+                results = []
+                for sheet_name in sheets:
+                    df = pd.read_excel(
+                        excel_file,
+                        sheet_name=sheet_name,
+                        header=None,
+                        dtype=str,
+                        na_filter=False
+                    )
+                    if not df.empty:
+                        results.append({
+                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
+                            "rows": df.values.tolist(),
+                            "type": "excel"
+                        })
+                return results if results else [{"error": "No data found in any sheet"}]
+        except Exception as openpyxl_error:
+            # Fallback to xlrd
             try:
-        […15 removed lines not legible in the source…]
+                with pd.ExcelFile(file_path, engine='xlrd') as excel_file:
+                    sheets = excel_file.sheet_names
+                    results = []
+                    for sheet_name in sheets:
+                        df = pd.read_excel(
+                            excel_file,
+                            sheet_name=sheet_name,
+                            header=None,
+                            dtype=str,
+                            na_filter=False
+                        )
+                        if not df.empty:
+                            results.append({
+                                "filename": f"{os.path.basename(file_path)} - {sheet_name}",
+                                "rows": df.values.tolist(),
+                                "type": "excel"
+                            })
+                    return results if results else [{"error": "No data found in any sheet"}]
+            except Exception as xlrd_error:
+                logger.error(f"Excel processing failed: {xlrd_error}")
+                return [{"error": f"Excel processing failed: {str(xlrd_error)}"}]
     except Exception as e:
-        logger.error(f"Excel …
-        return [{"error": f"Excel …
+        logger.error(f"Excel file opening error: {e}")
+        return [{"error": f"Excel file opening error: {str(e)}"}]
 
 def csv_to_json(file_path: str) -> List[Dict]:
-    """Optimized CSV processing with chunking"""
     try:
         chunks = []
         for chunk in pd.read_csv(
@@ -201,7 +220,6 @@ def csv_to_json(file_path: str) -> List[Dict]:
 
 @lru_cache(maxsize=100)
 def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
-    """Cached file processing with memory optimization"""
     try:
         if file_type == "pdf":
             chunks = extract_all_pages(file_path)
@@ -218,80 +236,17 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
         else:
             return [{"error": f"Unsupported file type: {file_type}"}]
     except Exception as e:
-        logger.error(f"Error processing …
-        return [{"error": f"Error processing …
-
-def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
-    """Optimized tokenization and chunking with early validation"""
-    if len(text) > MAX_TEXT_LENGTH:
-        logger.warning(f"Text length ({len(text)}) exceeds limit ({MAX_TEXT_LENGTH}). Truncating.")
-        text = text[:MAX_TEXT_LENGTH]
-
-    tokenizer = get_tokenizer()
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    if len(tokens) > MODEL_MAX_TOKENS:
-        logger.error(f"Token count ({len(tokens)}) exceeds model limit ({MODEL_MAX_TOKENS}).")
-        return [text[:MAX_TEXT_LENGTH // 10]] # Fallback to small chunk
-
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    for token in tokens:
-        if current_length + 1 > max_tokens:
-            chunks.append(tokenizer.decode(current_chunk))
-            current_chunk = [token]
-            current_length = 1
-        else:
-            current_chunk.append(token)
-            current_length += 1
-
-    if current_chunk:
-        chunks.append(tokenizer.decode(current_chunk))
-
-    return chunks
-
-def log_system_usage(tag=""):
-    """Optimized system monitoring"""
-    try:
-        cpu = psutil.cpu_percent(interval=0.5)
-        mem = psutil.virtual_memory()
-        logger.info(f"[{tag}] CPU: {cpu:.1f}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
-
-        try:
-            result = subprocess.run(
-                ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
-                capture_output=True,
-                text=True,
-                timeout=2
-            )
-            if result.returncode == 0:
-                used, total, util = result.stdout.strip().split(", ")
-                logger.info(f"[{tag}] GPU: {used}MB / {total}MB | Utilization: {util}%")
-        except subprocess.TimeoutExpired:
-            logger.warning(f"[{tag}] GPU monitoring timed out")
-    except Exception as e:
-        logger.error(f"[{tag}] Monitor failed: {e}")
+        logger.error(f"Error processing file: {e}")
+        return [{"error": f"Error processing file: {str(e)}"}]
 
 def clean_response(text: str) -> str:
-    """Enhanced response cleaning with aggressive deduplication"""
     if not text:
         return ""
 
     patterns = [
         (re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
-        (re.compile(r"(The patient record excerpt provides|Patient record excerpt contains).*?(John Doe|general information).*?\.", re.IGNORECASE), ""),
-        (re.compile(r"To (analyze|proceed).*?medications\.", re.IGNORECASE), ""),
-        (re.compile(r"Since the previous attempts.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"I need to.*?results\.", re.IGNORECASE), ""),
-        (re.compile(r"(Therefore, )?(Retrieving|I will start by retrieving) tools.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"This requires reviewing.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"Given the context, it is important to review.*?\.", re.IGNORECASE), ""),
-        (re.compile(r"Final Analysis\s*", re.IGNORECASE), ""),
-        (re.compile(r"Therefore, no missed diagnoses can be identified.*?\.", re.IGNORECASE), ""),
         (re.compile(r"\s+"), " "),
         (re.compile(r"[^\w\s\.\,\(\)\-]"), ""),
-        (re.compile(r"(No missed diagnoses identified\.)\s*\1+", re.IGNORECASE), r"\1"),
     ]
 
     for pattern, repl in patterns:
@@ -314,74 +269,11 @@ def clean_response(text: str) -> str:
             seen.add(s)
 
     text = ". ".join(unique_sentences).strip()
-
     return text if text else "No missed diagnoses identified."
 
-def summarize_findings(combined_response: str) -> str:
-    """Enhanced findings summarization for a single, concise paragraph"""
-    if not combined_response:
-        return "No missed diagnoses were identified in the provided records."
-
-    diagnosis_pattern = re.compile(r"-\s*(.+)$")
-    section_pattern = re.compile(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)")
-    no_issues_pattern = re.compile(r"No issues identified|No missed diagnoses identified", re.IGNORECASE)
-
-    diagnoses = []
-    current_section = None
-
-    for line in combined_response.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-
-        section_match = section_pattern.match(line)
-        if section_match:
-            current_section = "diagnoses" if section_match.group(1) == "Missed Diagnoses" else None
-            continue
-
-        if current_section == "diagnoses":
-            diagnosis_match = diagnosis_pattern.match(line)
-            if diagnosis_match and not no_issues_pattern.search(line):
-                diagnosis = diagnosis_match.group(1).strip()
-                if diagnosis:
-                    diagnoses.append(diagnosis)
-
-    medication_pattern = re.compile(r"medications includ(?:e|ing|ed) ([^\.]+)", re.IGNORECASE)
-    evaluation_pattern = re.compile(r"psychiatric evaluation.*?mention of ([^\.]+)", re.IGNORECASE)
-
-    for line in combined_response.splitlines():
-        line = line.strip()
-        if not line or no_issues_pattern.search(line):
-            continue
-
-        med_match = medication_pattern.search(line)
-        if med_match:
-            meds = med_match.group(1).strip()
-            diagnoses.append(f"use of medications ({meds}), suggesting an undiagnosed psychiatric condition requiring urgent review")
-
-        eval_match = evaluation_pattern.search(line)
-        if eval_match:
-            details = eval_match.group(1).strip()
-            diagnoses.append(f"psychiatric evaluation noting {details}, indicating a potential missed psychiatric diagnosis requiring urgent review")
-
-    if not diagnoses:
-        return "No missed diagnoses were identified in the provided records."
-
-    seen = set()
-    unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
-
-    summary = "The patient record indicates missed diagnoses including "
-    summary += ", ".join(unique_diagnoses[:-1])
-    summary += f", and {unique_diagnoses[-1]}" if len(unique_diagnoses) > 1 else unique_diagnoses[0]
-    summary += ". These findings suggest potential oversights in the patient's medical evaluation and require urgent clinical review to prevent adverse outcomes."
-
-    return summary
-
 @lru_cache(maxsize=1)
 def init_agent():
-    """Cached agent initialization with memory optimization"""
     logger.info("Initializing model...")
-    log_system_usage("Before Load")
 
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
@@ -399,15 +291,12 @@ def init_agent():
         additional_default_tools=[],
     )
     agent.init_model()
-
-    log_system_usage("After Load")
     logger.info("Agent Ready")
     return agent
 
 def create_ui(agent):
-    """Optimized UI creation with pre-compiled templates"""
    PROMPT_TEMPLATE = """
-Analyze the patient record excerpt for missed diagnoses
+Analyze the patient record excerpt for missed diagnoses. Provide detailed, evidence-based analysis.
 Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
@@ -417,8 +306,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
     with gr.Row():
         with gr.Column(scale=3):
-            chatbot = gr.Chatbot(label="Analysis Summary", height=600 …
-            msg_input = gr.Textbox(placeholder="Ask about potential oversights..." …
+            chatbot = gr.Chatbot(label="Analysis Summary", height=600)
+            msg_input = gr.Textbox(placeholder="Ask about potential oversights...")
             send_btn = gr.Button("Analyze", variant="primary")
             file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
 
@@ -428,7 +317,6 @@ Patient Record Excerpt (Chunk {0} of {1}):
     progress_bar = gr.Progress()
 
    def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
-        """Optimized analysis pipeline with quick summary and background report"""
         history.append({"role": "user", "content": message})
         yield history, None, ""
 
@@ -442,31 +330,38 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
             if cache_key in cache:
                 extracted.extend(cache[cache_key])
+                history.append({"role": "assistant", "content": f"Using cached data for {os.path.basename(f.name)}"})
+                yield history, None, ""
             else:
                 result = process_file_cached(f.name, file_type)
-                […2 removed lines not legible in the source…]
+                if result and not (len(result) == 1 and "error" in result[0]):
+                    cache[cache_key] = result
+                    extracted.extend(result)
+                    history.append({"role": "assistant", "content": f"Processed {os.path.basename(f.name)}"})
+                    yield history, None, ""
+                else:
+                    error_msg = result[0]["error"] if result else "Unknown error"
+                    history.append({"role": "assistant", "content": f"Failed to process {os.path.basename(f.name)}: {error_msg}"})
+                    yield history, None, error_msg
+                    return
 
         file_hash_value = file_hash(files[0].name) if files else ""
-        history.append({"role": "assistant", "content": "✅ File processing complete"})
-        yield history, None, ""
 
         if not extracted:
-            history.append({"role": "assistant", "content": "❌ No valid content extracted …
-            yield history, None, "No valid content extracted …
+            history.append({"role": "assistant", "content": "❌ No valid content extracted"})
+            yield history, None, "No valid content extracted"
+            return
+
+        chunks = [item["content"] for item in extracted if "content" in item]
+        if not chunks:
+            history.append({"role": "assistant", "content": "❌ No processable content found"})
+            yield history, None, "No processable content found"
             return
 
         combined_response = ""
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-        seen_responses = set()
 
         try:
-            chunks = [item["content"] for item in extracted if "content" in item]
-            if not chunks:
-                history.append({"role": "assistant", "content": "❌ No processable content found in the file."})
-                yield history, None, "No processable content found."
-                return
-
             for batch_idx in range(0, len(chunks), BATCH_SIZE):
                 batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
 
@@ -483,7 +378,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")
 
                 with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
-                    […1 removed line not legible in the source…]
+                    futures = {
                         executor.submit(
                             agent.run_quick_summary,
                             chunk, 0.2, 256, 1024
@@ -491,74 +386,31 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         for idx, chunk in enumerate(batch_chunks)
                     }
 
-                    for future in as_completed( …
-                        chunk_idx = …
-                        try:
-                            quick_response = clean_response(future.result())
-                            if quick_response and quick_response != "No missed diagnoses identified.":
-                                is_unique = True
-                                for seen_response in seen_responses:
-                                    if SequenceMatcher(None, quick_response.lower(), seen_response.lower()).ratio() > 0.9:
-                                        is_unique = False
-                                        break
-                                if is_unique:
-                                    combined_response += f"--- Quick Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{quick_response}\n"
-                                    seen_responses.add(quick_response)
-                                    history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                            yield history, None, ""
-                        except Exception as e:
-                            logger.error(f"Quick summary error for chunk {batch_idx + chunk_idx + 1}: {e}")
-                            history[-1] = {"role": "assistant", "content": f"Error processing chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
-                            yield history, None, ""
-                        finally:
-                            del future
-                            torch.cuda.empty_cache()
-                            gc.collect()
-
-            # Start background detailed analysis
-            with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
-                detailed_futures = {
-                    executor.submit(
-                        agent.run_gradio_chat,
-                        prompt, [], 0.2, 512, 2048, False, None, 3, None, 0, None, report_path
-                    ): idx
-                    for idx, prompt in enumerate(batch_prompts)
-                }
-
-                for future in as_completed(detailed_futures):
-                    chunk_idx = detailed_futures[future]
+                    for future in as_completed(futures):
+                        chunk_idx = futures[future]
                         try:
-                            […5 removed lines not legible in the source…]
-                            if cleaned_content and cleaned_content != "No missed diagnoses identified.":
-                                combined_response += cleaned_content + "\n"
-                                history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                            yield history, report_path, ""
+                            response = clean_response(future.result())
+                            if response:
+                                combined_response += f"--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
+                                history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                                yield history, None, ""
                         except Exception as e:
-                            logger.error(f" …
-                            history[-1] = {"role": "assistant", "content": f"Error …
+                            logger.error(f"Chunk processing error: {e}")
+                            history[-1] = {"role": "assistant", "content": f"Error processing chunk: {str(e)}"}
                             yield history, None, ""
                         finally:
                             del future
                             torch.cuda.empty_cache()
                            gc.collect()
 
-            summary = …
-            […2 removed lines not legible in the source…]
-                history.append({"role": "assistant", "content": "Detailed report ready for download."})
-                yield history, report_path, summary
-            else:
-                history.append({"role": "assistant", "content": "Detailed report still processing."})
-                yield history, None, summary
+            summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
+            history.append({"role": "assistant", "content": "Analysis completed successfully"})
+            yield history, report_path, summary
 
         except Exception as e:
             logger.error(f"Analysis error: {e}")
             history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
-            yield history, None, f"Error occurred …
+            yield history, None, f"Error occurred: {str(e)}"
         finally:
             torch.cuda.empty_cache()
             gc.collect()
@@ -578,22 +430,14 @@ Patient Record Excerpt (Chunk {0} of {1}):
 
 if __name__ == "__main__":
     try:
-        logger.info("Launching …
+        logger.info("Launching app...")
         agent = init_agent()
         demo = create_ui(agent)
-        demo.queue(
-            api_open=False,
-            max_size=20
-        ).launch(
+        demo.queue().launch(
             server_name="0.0.0.0",
             server_port=7860,
-            show_error=True
-            allowed_paths=[report_dir],
-            share=False
+            show_error=True
        )
     except Exception as e:
         logger.error(f"Fatal error: {e}")
-        raise
-    finally:
-        if torch.distributed.is_initialized():
-            torch.distributed.destroy_process_group()
+        raise
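
Two patterns introduced by this commit, sketched standalone below. Both snippets are illustrative sketches rather than code from the repository: the helper names (read_excel_rows, summarize, stream_summaries), the sample path, and the sample chunk text are invented for the example.

The updated excel_to_json tries the openpyxl engine first and falls back to xlrd for legacy .xls files, reading every sheet with header=None and dtype=str so rows come back as raw strings. A minimal version of that engine-fallback pattern:

import os
from typing import Dict, List

import pandas as pd

def read_excel_rows(file_path: str) -> List[Dict]:
    # Try openpyxl (.xlsx) first, then xlrd (legacy .xls), as excel_to_json does.
    for engine in ("openpyxl", "xlrd"):
        try:
            with pd.ExcelFile(file_path, engine=engine) as excel_file:
                results = []
                for sheet_name in excel_file.sheet_names:
                    # header=None keeps every row; dtype=str and na_filter=False
                    # stop pandas from guessing types or dropping empty cells.
                    df = pd.read_excel(excel_file, sheet_name=sheet_name,
                                       header=None, dtype=str, na_filter=False)
                    if not df.empty:
                        results.append({
                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
                            "rows": df.values.tolist(),
                            "type": "excel",
                        })
                return results if results else [{"error": "No data found in any sheet"}]
        except Exception:
            continue  # this engine could not read the file; try the next one
    return [{"error": "Excel processing failed with both engines"}]

The reworked analyze() handler submits each chunk to a ThreadPoolExecutor and streams partial results back with as_completed instead of waiting for the whole batch, which is what lets the chat history update incrementally. The same pattern in isolation, with a stub standing in for agent.run_quick_summary:

from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 2

def summarize(chunk: str) -> str:
    # Stand-in for agent.run_quick_summary; illustrative only.
    return f"summary of {len(chunk)} characters"

def stream_summaries(chunks):
    # Yield (chunk_index, result) pairs as soon as each future finishes.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(summarize, c): i for i, c in enumerate(chunks)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                yield idx, future.result()
            except Exception as exc:
                yield idx, f"error: {exc}"

for idx, summary in stream_summaries(["chunk one", "chunk two", "chunk three"]):
    print(idx, summary)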