# CPS-Test-Mobile
#
# Paused
#
# File size: 15,995 Bytes

import sys
import os
import pandas as pd
import pdfplumber
import json
import gradio as gr
from typing import List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
import shutil
import re
import psutil
import subprocess
from datetime import datetime

# Persistent directory setup
# Every cache and generated report lives under this single root, which is
# created at import time if it does not exist yet.
# NOTE(review): the path is named as a persistent location — presumably a
# mounted volume; confirm for the target deployment.
persistent_dir = "/data/hf_cache"
os.makedirs(persistent_dir, exist_ok=True)

# One sub-directory per kind of artifact kept under the root.
model_cache_dir = os.path.join(persistent_dir, "txagent_models")
tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
file_cache_dir = os.path.join(persistent_dir, "cache")
report_dir = os.path.join(persistent_dir, "reports")
vllm_cache_dir = os.path.join(persistent_dir, "vllm_cache")

# Create all sub-directories up front so later code can write into them
# without checking.
for directory in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir, vllm_cache_dir]:
    os.makedirs(directory, exist_ok=True)

# These variables are set before the txagent import further down, so they are
# already in the environment when that package is loaded.
os.environ["HF_HOME"] = model_cache_dir
os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
os.environ["VLLM_CACHE_DIR"] = vllm_cache_dir
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# CUDA kernel launches synchronous — confirm it is intended outside debugging.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Put the local "src" directory (next to this file) first on sys.path so the
# txagent package imported below is resolved from there.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.abspath(os.path.join(current_dir, "src"))
sys.path.insert(0, src_path)

from txagent.txagent import TxAgent

# Constants

# Words whose presence on a PDF page gets that page labelled as a medical
# section during extraction.
MEDICAL_KEYWORDS = {
    "allergies",
    "assessment",
    "conclusion",
    "diagnosis",
    "discharge",
    "examination",
    "findings",
    "history",
    "impression",
    "medications",
    "plan",
    "progress",
    "recommendations",
    "results",
    "summary",
}

# Upper bound, in characters, on the document text sent to the model per call.
CHUNK_SIZE = 10_000
# Value passed as the `max_token` argument of the agent's chat call.
MAX_TOKENS = 12_000

def sanitize_utf8(text: str) -> str:
    """Return *text* with every character that cannot be UTF-8 encoded dropped."""
    # Round-trip through bytes; un-encodable code points (e.g. lone
    # surrogates) are discarded by the "ignore" error handler.
    encoded = text.encode("utf-8", errors="ignore")
    return encoded.decode("utf-8")

def file_hash(path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is hashed in fixed-size blocks so that large uploads are never
    loaded into memory in one piece. The digest is identical to hashing the
    whole content at once.

    Args:
        path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the empty bytes object read() returns at EOF
        for block in iter(lambda: f.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()

def extract_all_pages(file_path: str) -> Tuple[str, int]:
    """
    Extract the text of every page of a PDF, labelling each page with a header.

    A page whose lower-cased text contains any MEDICAL_KEYWORDS entry as a
    whole word gets a "MEDICAL SECTION" header; every other page gets a plain
    page header. Page numbers in headers are 1-based.

    Returns (extracted_text, total_pages). On any failure the first element
    is an error message and total_pages is 0.
    """
    try:
        # Build the whole-word patterns once instead of once per page
        keyword_patterns = [re.compile(rf'\b{kw}\b') for kw in MEDICAL_KEYWORDS]
        labelled_pages = []
        total_pages = 0

        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

            for page_number, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text() or ""
                lowered = page_text.lower()

                is_medical = False
                for pattern in keyword_patterns:
                    if pattern.search(lowered):
                        is_medical = True
                        break

                if is_medical:
                    header = f"=== MEDICAL SECTION (Page {page_number}) ==="
                else:
                    header = f"=== Page {page_number} ==="
                labelled_pages.append(f"{header}\n{page_text.strip()}")

        return "\n\n".join(labelled_pages), total_pages
    except Exception as e:
        return f"PDF processing error: {str(e)}", 0

def convert_file_to_json(file_path: str, file_type: str) -> str:
    """Convert an uploaded file to a JSON string, caching successful conversions.

    The cache key is the MD5 of the file content, so re-uploading the same
    file returns the stored JSON. Only successful conversions are written to
    the cache: an unsupported type or a failed PDF extraction is returned to
    the caller but not persisted, so a later retry is not answered with a
    stale failure.

    Args:
        file_path: Path of the file to convert.
        file_type: Lower-case extension without the dot ("pdf", "csv",
            "xls" or "xlsx").

    Returns:
        A JSON document as a string. PDFs produce "filename", "content",
        "total_pages" and "status"; spreadsheets produce "filename" and
        "rows"; failures produce a single "error" key (a failed PDF
        extraction keeps the PDF shape with "status" set to "error").
    """
    try:
        h = file_hash(file_path)
        cache_path = os.path.join(file_cache_dir, f"{h}.json")

        if os.path.exists(cache_path):
            with open(cache_path, "r", encoding="utf-8") as f:
                return f.read()

        filename = os.path.basename(file_path)
        cacheable = True

        if file_type == "pdf":
            text, total_pages = extract_all_pages(file_path)
            # extract_all_pages signals failure as (error message, 0 pages)
            failed = total_pages == 0 and text.startswith("PDF processing error:")
            cacheable = not failed
            result = json.dumps({
                "filename": filename,
                "content": text,
                "total_pages": total_pages,
                "status": "error" if failed else "complete"
            })
        elif file_type == "csv":
            # Read CSV in chunks to handle large files
            rows = []
            for chunk in pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
                                     skip_blank_lines=False, on_bad_lines="skip", chunksize=1000):
                rows.extend(chunk.fillna("").astype(str).values.tolist())
            result = json.dumps({"filename": filename, "rows": rows})
        elif file_type in ["xls", "xlsx"]:
            try:
                df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
            except Exception:
                # Fall back to the other Excel engine when openpyxl cannot read the file
                df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
            rows = df.fillna("").astype(str).values.tolist()
            result = json.dumps({"filename": filename, "rows": rows})
        else:
            cacheable = False
            result = json.dumps({"error": f"Unsupported file type: {file_type}"})

        if cacheable:
            with open(cache_path, "w", encoding="utf-8") as f:
                f.write(result)
        return result
    except Exception as e:
        return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})

def log_system_usage(tag=""):
    """Print CPU, RAM and GPU usage, each line prefixed with ``[tag]``.

    CPU usage is sampled over one second, so this call blocks for about that
    long. GPU figures come from ``nvidia-smi``; it prints one CSV line per
    GPU, and every line is reported (with a GPU index when there is more than
    one). Any failure is reported as a single message instead of being
    raised, because this is best-effort diagnostics only.

    Args:
        tag: Label identifying the point in the program being measured.
    """
    try:
        cpu = psutil.cpu_percent(interval=1)
        mem = psutil.virtual_memory()
        print(f"[{tag}] CPU: {cpu}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
            capture_output=True, text=True
        )
        if result.returncode == 0:
            gpu_lines = [line for line in result.stdout.splitlines() if line.strip()]
            for index, line in enumerate(gpu_lines):
                used, total, util = (field.strip() for field in line.split(","))
                # Keep the original single-GPU label; number the GPUs otherwise
                label = "GPU" if len(gpu_lines) == 1 else f"GPU {index}"
                print(f"[{tag}] {label}: {used}MB / {total}MB | Utilization: {util}%")
    except Exception as e:
        print(f"[{tag}] GPU/CPU monitor failed: {e}")

def clean_response(text: str) -> str:
    """Strip tool-call artifacts and boilerplate from a model response.

    The text is first made UTF-8 clean, then each removal pattern is applied
    in order, runs of three or more newlines are collapsed to two, and
    surrounding whitespace is trimmed.
    """
    # (pattern, flags) pairs; every match is deleted, in this order
    removals = (
        # Everything from a tool-call marker to the end of the text
        (r"\[TOOL_CALLS\].*", re.DOTALL),
        # Bracketed tool-name lists such as ['get_...']
        (r"\['get_[^\]]+\']\n?", 0),
        # Dict-style tool results with 'meta' and 'results' keys
        (r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", re.DOTALL),
        # Repetitive introductory phrase
        (r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", re.DOTALL),
    )

    cleaned = sanitize_utf8(text)
    for pattern, flags in removals:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)

    collapsed = re.sub(r"\n{3,}", "\n\n", cleaned)
    return collapsed.strip()

def format_final_report(analysis_results: List[str], filename: str) -> str:
    """Combine the per-chunk analysis texts into one formatted report.

    Each result is searched for the known section headings; the text that
    follows a heading (up to a blank line, a line starting with ``*``, or the
    end of the text) is collected under that section, without duplicates.
    A non-empty result in which no section could be found is kept verbatim
    under an extra heading instead of being dropped, so the report never
    claims "no oversights" while model output was discarded.

    Args:
        analysis_results: Raw analysis text for each processed chunk.
        filename: Name shown in the report header.

    Returns:
        The report as a single newline-joined string.
    """
    report = [
        "COMPREHENSIVE CLINICAL OVERSIGHT ANALYSIS",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"File: {filename}",
        "=" * 80,
    ]

    # Extract sections from all chunks
    sections = {
        "CRITICAL FINDINGS": [],
        "MISSED DIAGNOSES": [],
        "MEDICATION ISSUES": [],
        "ASSESSMENT GAPS": [],
        "FOLLOW-UP RECOMMENDATIONS": []
    }
    # Compile each heading pattern once rather than once per result
    section_patterns = {
        name: re.compile(
            rf"{re.escape(name)}:?\s*\n([^*]+?)(?=\n\*|\n\n|$)",
            re.IGNORECASE | re.DOTALL
        )
        for name in sections
    }
    # Results that yielded no section content at all
    unstructured = []

    for result in analysis_results:
        found_section = False
        for section, pattern in section_patterns.items():
            section_match = pattern.search(result)
            if not section_match:
                continue
            content = section_match.group(1).strip()
            if not content:
                continue
            found_section = True
            if content not in sections[section]:
                sections[section].append(content)
        if not found_section:
            leftover = result.strip()
            if leftover and leftover not in unstructured:
                unstructured.append(leftover)

    # Build the final report - prioritize critical findings
    if sections["CRITICAL FINDINGS"]:
        report.append("\n🚨 **CRITICAL FINDINGS** 🚨")
        for content in sections["CRITICAL FINDINGS"]:
            report.append(f"\n{content}")

    # Add other sections
    for section, contents in sections.items():
        if section != "CRITICAL FINDINGS" and contents:
            report.append(f"\n**{section.upper()}**")
            for content in contents:
                report.append(f"\n{content}")

    # Keep analysis text that did not follow the expected heading layout
    if unstructured:
        report.append("\n**ADDITIONAL UNSTRUCTURED FINDINGS**")
        for content in unstructured:
            report.append(f"\n{content}")

    if not any(sections.values()) and not unstructured:
        report.append("\nNo significant clinical oversights identified.")

    report.append("\n" + "=" * 80)
    report.append("END OF REPORT")

    return "\n".join(report)

def init_agent():
    """Create the TxAgent, load its model, and return the ready agent.

    The tool definition file is copied into the tool cache directory the
    first time it is needed; system usage is logged before and after the
    model is loaded.
    """
    print("🔁 Initializing model...")
    log_system_usage("Before Load")

    # Seed the tool cache from the bundled file only when no copy exists yet
    bundled_tool_file = os.path.abspath("data/new_tool.json")
    cached_tool_file = os.path.join(tool_cache_dir, "new_tool.json")
    already_cached = os.path.exists(cached_tool_file)
    if not already_cached:
        shutil.copy(bundled_tool_file, cached_tool_file)

    agent_options = dict(
        model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
        rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
        tool_files_dict={"new_tool": cached_tool_file},
        force_finish=True,
        enable_checker=True,
        step_rag_num=2,
        seed=100,
        additional_default_tools=[],
    )
    tx_agent = TxAgent(**agent_options)
    tx_agent.init_model()

    log_system_usage("After Load")
    print("✅ Agent Ready")
    return tx_agent

def _split_into_chunks(content: str, limit: int) -> List[str]:
    """Split *content* into chunks of at most *limit* characters, on page headers where possible."""
    if limit <= 0:
        raise ValueError("limit must be a positive number of characters")

    separator = "\n\n"
    # Zero-width split: each page header stays attached to the page text that
    # follows it instead of becoming a separate piece.
    parts = re.split(
        r"(?=^=== (?:MEDICAL SECTION \(Page \d+\)|Page \d+) ===)",
        content,
        flags=re.MULTILINE,
    )
    sections = [part.strip() for part in parts if part.strip()]

    chunks = []
    current = ""
    for section in sections:
        # A section longer than the limit (e.g. spreadsheet content, which
        # has no page headers) is cut into limit-sized pieces so none of it
        # is lost to truncation later on.
        for start in range(0, len(section), limit):
            piece = section[start:start + limit]
            if not current:
                current = piece
            elif len(current) + len(separator) + len(piece) > limit:
                chunks.append(current)
                current = piece
            else:
                current = f"{current}{separator}{piece}"

    if current:
        chunks.append(current)
    return chunks


def analyze_large_document(content: str, filename: str, agent: TxAgent) -> str:
    """Analyze a whole document chunk by chunk and return the combined report.

    The content is divided into chunks of at most CHUNK_SIZE characters,
    breaking on page headers when possible. Every chunk is sent to
    process_chunk, and the individual results are merged by
    format_final_report.

    Args:
        content: Full extracted text of the uploaded document(s).
        filename: Name used in prompts and in the report header.
        agent: Initialized agent used to analyze each chunk.

    Returns:
        The formatted final report.
    """
    analysis_results = [
        process_chunk(chunk, filename, agent)
        for chunk in _split_into_chunks(content, CHUNK_SIZE)
    ]
    return format_final_report(analysis_results, filename)

def process_chunk(chunk: str, filename: str, agent: TxAgent) -> str:
    """Ask the agent to analyze one chunk and return its cleaned response text.

    At most CHUNK_SIZE characters of the chunk are placed in the prompt.
    Every non-empty cleaned output produced by the agent is appended to the
    result, each followed by a newline.
    """
    prompt = f"""
Analyze this section of medical records for clinical oversights. Focus on:
1. Critical findings needing immediate attention
2. Potential missed diagnoses
3. Medication conflicts
4. Assessment gaps
5. Follow-up recommendations

File: {filename}
Content:
{chunk[:CHUNK_SIZE]}

Provide concise findings in bullet points under relevant headings.
Focus on factual evidence from the content.
"""

    outputs = agent.run_gradio_chat(
        message=prompt,
        history=[],
        temperature=0.1,  # Lower temperature for more factual responses
        max_new_tokens=1024,
        max_token=MAX_TOKENS,
        call_agent=False,
        conversation=[],
    )

    collected = []
    for output in outputs:
        if output is None:
            continue

        # Normalize both output shapes to a list of raw texts
        if isinstance(output, list):
            raw_texts = [m.content for m in output if hasattr(m, 'content') and m.content]
        elif isinstance(output, str) and output.strip():
            raw_texts = [output]
        else:
            raw_texts = []

        for raw in raw_texts:
            cleaned = clean_response(raw)
            if cleaned:
                collected.append(cleaned + "\n")

    return "".join(collected)

def create_ui(agent):
    """Build the Gradio interface for uploading records and viewing the report.

    Args:
        agent: Initialized agent passed through to the document analysis.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Clinical Oversight Assistant") as demo:
        gr.Markdown("""
        <h1 style='text-align: center;'>🩺 Comprehensive Clinical Oversight Assistant</h1>
        <p style='text-align: center;'>Analyze complete medical records for potential oversights</p>
        """)

        with gr.Row():
            with gr.Column(scale=3):
                file_upload = gr.File(
                    file_types=[".pdf", ".csv", ".xls", ".xlsx"],
                    file_count="multiple",
                    label="Upload Medical Records"
                )
                msg_input = gr.Textbox(
                    placeholder="Optional: Add specific focus areas or questions...",
                    label="Analysis Focus"
                )
                with gr.Row():
                    send_btn = gr.Button("Analyze Full Document", variant="primary")
                    clear_btn = gr.Button("Clear")
                status = gr.Textbox(label="Status", interactive=False)

            with gr.Column(scale=7):
                report_output = gr.Textbox(
                    label="Clinical Oversight Report",
                    lines=20,
                    max_lines=50,
                    interactive=False
                )
                download_output = gr.File(
                    label="Download Full Report",
                    visible=False
                )

        def hidden_download():
            """Return an update that empties and hides the download component."""
            return gr.update(value=None, visible=False)

        def analyze(files: List, message: str):
            """Convert the uploaded files, analyze their content and stream UI updates.

            Yields ``(report_text, download_update, status_text)`` tuples.
            NOTE(review): ``message`` is received from the "Analysis Focus"
            box but is not forwarded to the analysis.
            """
            if not files:
                yield "", hidden_download(), "⚠️ Please upload at least one file to analyze."
                return

            yield "", hidden_download(), "⏳ Processing documents..."

            # Uploaded items may be path strings or objects exposing `.name`
            paths = [getattr(f, "name", f) for f in files]
            filenames = [os.path.basename(p) for p in paths]

            with ThreadPoolExecutor(max_workers=4) as executor:
                futures = [
                    executor.submit(
                        convert_file_to_json,
                        p,
                        os.path.splitext(p)[1].lstrip(".").lower()
                    )
                    for p in paths
                ]
                # Collect in submission order so the combined content (and the
                # report file name derived from it) is deterministic.
                file_contents = [sanitize_utf8(future.result()) for future in futures]

            texts = []
            errors = []
            for name, raw in zip(filenames, file_contents):
                parsed = json.loads(raw)  # parse each payload once
                if "error" in parsed:
                    errors.append(f"{name}: {parsed['error']}")
                    continue
                if "content" in parsed:
                    content = parsed.get("content", "")
                    # A failed PDF extraction is reported as error text with zero pages
                    if parsed.get("total_pages") == 0 and str(content).startswith("PDF processing error:"):
                        errors.append(f"{name}: {content}")
                        continue
                    texts.append(content)
                else:
                    texts.append(str(parsed.get("rows", "")))

            combined_filename = " + ".join(filenames)
            # Decoded JSON may contain characters that cannot be UTF-8 encoded
            # for hashing or writing the report, so clean the text itself.
            combined_content = sanitize_utf8("\n".join(texts))

            if not combined_content.strip():
                detail = "; ".join(errors) if errors else "no readable content was found"
                yield "", hidden_download(), f"❌ Nothing to analyze: {detail}"
                return

            yield "", hidden_download(), "🔍 Analyzing content..."

            try:
                # Process the complete document
                full_report = analyze_large_document(
                    combined_content,
                    combined_filename,
                    agent
                )

                # Save report to file
                file_hash_value = hashlib.md5(combined_content.encode()).hexdigest()
                report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt")
                with open(report_path, "w", encoding="utf-8") as f:
                    f.write(full_report)

                # The download component starts hidden; it must be made
                # visible explicitly once a report file exists.
                if os.path.exists(report_path):
                    download_update = gr.update(value=report_path, visible=True)
                else:
                    download_update = hidden_download()

                final_status = "✅ Analysis complete!"
                if errors:
                    final_status += f" ⚠️ Skipped {len(errors)} file(s): {'; '.join(errors)}"
                yield full_report, download_update, final_status

            except Exception as e:
                error_msg = f"❌ Error during analysis: {str(e)}"
                print(error_msg)
                yield "", hidden_download(), error_msg

        # UI event handlers
        send_btn.click(
            fn=analyze,
            inputs=[file_upload, msg_input],
            outputs=[report_output, download_output, status],
            api_name="analyze"
        )

        clear_btn.click(
            fn=lambda: ("", hidden_download(), ""),
            inputs=None,
            outputs=[report_output, download_output, status]
        )

    return demo

if __name__ == "__main__":
    # Script entry point: load the model, build the UI, then serve it.
    print("🚀 Launching app...")
    agent = init_agent()
    demo = create_ui(agent)

    # Request queue settings and server settings, kept separate for readability
    queue_options = {"api_open": False, "max_size": 20}
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        # Lets the server hand out the generated report files
        "allowed_paths": [report_dir],
        "share": False,
    }

    queued_app = demo.queue(**queue_options)
    queued_app.launch(**launch_options)