CPS-Test-Mobile

Paused

App Files Community

Ali2206 committed on Apr 20

Commit

44280bd

verified ·

1 Parent(s): 02a4d5e

Update app.py

Browse files

Files changed (1) hide show

app.py +282 -138

app.py CHANGED Viewed

@@ -4,15 +4,16 @@ import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
 import re
 import psutil
 import subprocess
-# Persistent directory
 persistent_dir = "/data/hf_cache"
 os.makedirs(persistent_dir, exist_ok=True)
@@ -37,46 +38,78 @@ sys.path.insert(0, src_path)
 from txagent.txagent import TxAgent
-MEDICAL_KEYWORDS = {'diagnosis', 'assessment', 'plan', 'results', 'medications',
-                    'allergies', 'summary', 'impression', 'findings', 'recommendations'}
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
 def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
-def extract_priority_pages(file_path: str) -> str:
     try:
         text_chunks = []
         with pdfplumber.open(file_path) as pdf:
             for i, page in enumerate(pdf.pages):
                 page_text = page.extract_text() or ""
-                if i < 3 or any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
                     text_chunks.append(f"=== Page {i+1} ===\n{page_text.strip()}")
-        return "\n\n".join(text_chunks)
     except Exception as e:
-        return f"PDF processing error: {str(e)}"
 def convert_file_to_json(file_path: str, file_type: str) -> str:
     try:
         h = file_hash(file_path)
         cache_path = os.path.join(file_cache_dir, f"{h}.json")
         if os.path.exists(cache_path):
             with open(cache_path, "r", encoding="utf-8") as f:
                 return f.read()
         if file_type == "pdf":
-            text = extract_priority_pages(file_path)
-            result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
         elif file_type == "csv":
-            df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
-                             skip_blank_lines=False, on_bad_lines="skip")
-            content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         elif file_type in ["xls", "xlsx"]:
             try:
                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
             except Exception:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
@@ -84,6 +117,7 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(result)
         return result
@@ -91,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
         mem = psutil.virtual_memory()
@@ -106,21 +141,74 @@ def log_system_usage(tag=""):
         print(f"[{tag}] GPU/CPU monitor failed: {e}")
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
-    # Remove tool calls, JSON data, and repetitive phrases
     text = re.sub(r"\[TOOL_CALLS\].*", "", text, flags=re.DOTALL)
-    text = re.sub(r"\['get_[^\]]+\']\n?", "", text)  # Remove tool names
-    text = re.sub(r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", "", text, flags=re.DOTALL)  # Remove JSON
     text = re.sub(r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", "", text, flags=re.DOTALL)
     text = re.sub(r"\n{3,}", "\n\n", text).strip()
-    # Only keep text under analysis headings or relevant content
-    if not re.search(r"(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", text):
-        return ""
     return text
 def init_agent():
     print("🔁 Initializing model...")
     log_system_usage("Before Load")
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
     if not os.path.exists(target_tool_path):
@@ -141,135 +229,191 @@ def init_agent():
     print("✅ Agent Ready")
     return agent
-def create_ui(agent):
-    with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
-        chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
-        file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
-        msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
-        send_btn = gr.Button("Analyze", variant="primary")
-        download_output = gr.File(label="Download Full Report")
-        def analyze(message: str, history: List[dict], files: List):
-            history.append({"role": "user", "content": message})
-            history.append({"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."})
-            yield history, None
-            extracted = ""
-            file_hash_value = ""
-            if files:
-                with ThreadPoolExecutor(max_workers=6) as executor:
-                    futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files]
-                    results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
-                    extracted = "\n".join(results)
-                    file_hash_value = file_hash(files[0].name) if files else ""
-            # Split extracted text into chunks of ~4,000 characters
-            chunk_size = 4000
-            chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
-            combined_response = ""
-            prompt_template = f"""
-Analyze the medical records for clinical oversights. Provide a concise, evidence-based summary under these headings:
-1. **Missed Diagnoses**:
-   - Identify inconsistencies in history, symptoms, or tests.
-   - Consider psychiatric, neurological, infectious, autoimmune, genetic conditions, family history, trauma, and developmental factors.
-2. **Medication Conflicts**:
-   - Check for contraindications, interactions, or unjustified off-label use.
-   - Assess if medications worsen diagnoses or cause adverse effects.
-3. **Incomplete Assessments**:
-   - Note missing or superficial cognitive, psychiatric, social, or family assessments.
-   - Highlight gaps in medical history, substance use, or lab/imaging documentation.
-4. **Urgent Follow-up**:
-   - Flag abnormal lab results, imaging, behaviors, or legal history needing immediate reassessment or referral.
-Medical Records (Chunk {0} of {1}):
-{{chunk}}
-Begin analysis:
 """
             try:
-                if history and history[-1]["content"].startswith("⏳"):
-                    history.pop()
-                # Process each chunk and stream cleaned results
-                for chunk_idx, chunk in enumerate(chunks, 1):
-                    # Update UI with progress
-                    history.append({"role": "assistant", "content": f"🔄 Processing Chunk {chunk_idx} of {len(chunks)}..."})
-                    yield history, None
-                    prompt = prompt_template.format(chunk_idx, len(chunks), chunk=chunk)
-                    chunk_response = ""
-                    for chunk_output in agent.run_gradio_chat(
-                        message=prompt,
-                        history=[],
-                        temperature=0.2,
-                        max_new_tokens=1024,
-                        max_token=4096,
-                        call_agent=False,
-                        conversation=[],
-                    ):
-                        if chunk_output is None:
-                            continue
-                        if isinstance(chunk_output, list):
-                            for m in chunk_output:
-                                if hasattr(m, 'content') and m.content:
-                                    cleaned = clean_response(m.content)
-                                    if cleaned:
-                                        chunk_response += cleaned + "\n"
-                                        # Stream partial response to UI
-                                        if history[-1]["content"].startswith("🔄"):
-                                            history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
-                                        else:
-                                            history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
-                                        yield history, None
-                        elif isinstance(chunk_output, str) and chunk_output.strip():
-                            cleaned = clean_response(chunk_output)
-                            if cleaned:
-                                chunk_response += cleaned + "\n"
-                                # Stream partial response to UI
-                                if history[-1]["content"].startswith("🔄"):
-                                    history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
-                                else:
-                                    history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
-                                yield history, None
-                    # Append completed chunk response to combined response
-                    if chunk_response:
-                        combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
-                # Finalize UI with complete response
-                if combined_response:
-                    history[-1]["content"] = combined_response.strip()
-                else:
-                    history.append({"role": "assistant", "content": "No oversights identified."})
-                # Generate report file with cleaned response
-                report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-                if report_path:
-                    with open(report_path, "w", encoding="utf-8") as f:
-                        f.write(combined_response)
-                yield history, report_path if report_path and os.path.exists(report_path) else None
             except Exception as e:
-                print("🚨 ERROR:", e)
-                history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
-                yield history, None
-        send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
-        msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
     return demo
 if __name__ == "__main__":
     print("🚀 Launching app...")
     agent = init_agent()
     demo = create_ui(agent)
-    demo.queue(api_open=False).launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,

 import pdfplumber
 import json
 import gradio as gr
+from typing import List, Tuple, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
 import re
 import psutil
 import subprocess
+from datetime import datetime
+# Persistent directory setup
 persistent_dir = "/data/hf_cache"
 os.makedirs(persistent_dir, exist_ok=True)
 from txagent.txagent import TxAgent
+# Constants
+MEDICAL_KEYWORDS = {
+    'diagnosis', 'assessment', 'plan', 'results', 'medications',
+    'allergies', 'summary', 'impression', 'findings', 'recommendations',
+    'conclusion', 'history', 'examination', 'progress', 'discharge'
+}
+CHUNK_SIZE = 10000  # Increased chunk size for better context
+MAX_TOKENS = 12000  # Maximum tokens for analysis
 def sanitize_utf8(text: str) -> str:
+    """Ensure text is UTF-8 clean."""
     return text.encode("utf-8", "ignore").decode("utf-8")
 def file_hash(path: str) -> str:
+    """Generate MD5 hash of file content."""
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
+def extract_all_pages(file_path: str) -> Tuple[str, int]:
+    """
+    Extract all pages from PDF with smart prioritization of medical sections.
+    Returns (extracted_text, total_pages)
+    """
     try:
         text_chunks = []
+        total_pages = 0
         with pdfplumber.open(file_path) as pdf:
+            total_pages = len(pdf.pages)
             for i, page in enumerate(pdf.pages):
                 page_text = page.extract_text() or ""
+                lower_text = page_text.lower()
+                # Include all pages but mark sections with medical keywords
+                if any(re.search(rf'\b{kw}\b', lower_text) for kw in MEDICAL_KEYWORDS):
+                    text_chunks.append(f"=== MEDICAL SECTION (Page {i+1}) ===\n{page_text.strip()}")
+                else:
                     text_chunks.append(f"=== Page {i+1} ===\n{page_text.strip()}")
+        return "\n\n".join(text_chunks), total_pages
     except Exception as e:
+        return f"PDF processing error: {str(e)}", 0
 def convert_file_to_json(file_path: str, file_type: str) -> str:
+    """Convert file to JSON format with caching, processing all content."""
     try:
         h = file_hash(file_path)
         cache_path = os.path.join(file_cache_dir, f"{h}.json")
         if os.path.exists(cache_path):
             with open(cache_path, "r", encoding="utf-8") as f:
                 return f.read()
         if file_type == "pdf":
+            text, total_pages = extract_all_pages(file_path)
+            result = json.dumps({
+                "filename": os.path.basename(file_path),
+                "content": text,
+                "total_pages": total_pages,
+                "status": "complete"
+            })
         elif file_type == "csv":
+            # Read CSV in chunks to handle large files
+            chunks = []
+            for chunk in pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
+                                   skip_blank_lines=False, on_bad_lines="skip", chunksize=1000):
+                chunks.append(chunk.fillna("").astype(str).values.tolist())
+            content = [item for sublist in chunks for item in sublist]
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         elif file_type in ["xls", "xlsx"]:
             try:
+                # Read Excel in chunks if possible
                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
             except Exception:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(result)
         return result
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 def log_system_usage(tag=""):
+    """Log system resource usage."""
     try:
         cpu = psutil.cpu_percent(interval=1)
         mem = psutil.virtual_memory()
         print(f"[{tag}] GPU/CPU monitor failed: {e}")
 def clean_response(text: str) -> str:
+    """Clean and format the model response."""
     text = sanitize_utf8(text)
+    # Remove tool calls and JSON artifacts
     text = re.sub(r"\[TOOL_CALLS\].*", "", text, flags=re.DOTALL)
+    text = re.sub(r"\['get_[^\]]+\']\n?", "", text)
+    text = re.sub(r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", "", text, flags=re.DOTALL)
+    # Remove repetitive phrases
     text = re.sub(r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", "", text, flags=re.DOTALL)
+    # Collapse excessive newlines
     text = re.sub(r"\n{3,}", "\n\n", text).strip()
     return text
+def format_final_report(analysis_results: List[str], filename: str) -> str:
+    """Combine all analysis chunks into a well-formatted final report."""
+    report = []
+    report.append(f"COMPREHENSIVE CLINICAL OVERSIGHT ANALYSIS")
+    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    report.append(f"File: {filename}")
+    report.append("=" * 80)
+    # Extract sections from all chunks
+    sections = {
+        "CRITICAL FINDINGS": [],
+        "MISSED DIAGNOSES": [],
+        "MEDICATION ISSUES": [],
+        "ASSESSMENT GAPS": [],
+        "FOLLOW-UP RECOMMENDATIONS": []
+    }
+    for result in analysis_results:
+        for section in sections:
+            # Find section content using regex
+            section_match = re.search(
+                rf"{re.escape(section)}:?\s*\n([^*]+?)(?=\n\*|\n\n|$)",
+                result,
+                re.IGNORECASE | re.DOTALL
+            )
+            if section_match:
+                content = section_match.group(1).strip()
+                if content and content not in sections[section]:
+                    sections[section].append(content)
+    # Build the final report - prioritize critical findings
+    if sections["CRITICAL FINDINGS"]:
+        report.append("\n🚨 **CRITICAL FINDINGS** 🚨")
+        for content in sections["CRITICAL FINDINGS"]:
+            report.append(f"\n{content}")
+    # Add other sections
+    for section, contents in sections.items():
+        if section != "CRITICAL FINDINGS" and contents:
+            report.append(f"\n**{section.upper()}**")
+            for content in contents:
+                report.append(f"\n{content}")
+    if not any(sections.values()):
+        report.append("\nNo significant clinical oversights identified.")
+    report.append("\n" + "=" * 80)
+    report.append("END OF REPORT")
+    return "\n".join(report)
 def init_agent():
+    """Initialize the TxAgent with proper configuration."""
     print("🔁 Initializing model...")
     log_system_usage("Before Load")
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
     if not os.path.exists(target_tool_path):
     print("✅ Agent Ready")
     return agent
+def analyze_large_document(content: str, filename: str, agent: TxAgent) -> str:
+    """Analyze large documents by processing in logical sections."""
+    # Split content into logical sections
+    sections = re.split(r"(=== MEDICAL SECTION|=== Page \d+ ===)", content)
+    sections = [s.strip() for s in sections if s.strip()]
+    analysis_results = []
+    current_chunk = ""
+    for section in sections:
+        # If adding this section would exceed chunk size, analyze current chunk
+        if len(current_chunk) + len(section) > CHUNK_SIZE and current_chunk:
+            analysis_results.append(process_chunk(current_chunk, filename, agent))
+            current_chunk = section
+        else:
+            current_chunk += "\n\n" + section
+    # Process the last chunk
+    if current_chunk:
+        analysis_results.append(process_chunk(current_chunk, filename, agent))
+    return format_final_report(analysis_results, filename)
+def process_chunk(chunk: str, filename: str, agent: TxAgent) -> str:
+    """Process a single chunk of the document."""
+    prompt = f"""
+Analyze this section of medical records for clinical oversights. Focus on:
+1. Critical findings needing immediate attention
+2. Potential missed diagnoses
+3. Medication conflicts
+4. Assessment gaps
+5. Follow-up recommendations
+File: {filename}
+Content:
+{chunk[:CHUNK_SIZE]}
+Provide concise findings in bullet points under relevant headings.
+Focus on factual evidence from the content.
 """
+    full_response = ""
+    for output in agent.run_gradio_chat(
+        message=prompt,
+        history=[],
+        temperature=0.1,  # Lower temperature for more factual responses
+        max_new_tokens=1024,
+        max_token=MAX_TOKENS,
+        call_agent=False,
+        conversation=[],
+    ):
+        if output is None:
+            continue
+        if isinstance(output, list):
+            for m in output:
+                if hasattr(m, 'content') and m.content:
+                    cleaned = clean_response(m.content)
+                    if cleaned:
+                        full_response += cleaned + "\n"
+        elif isinstance(output, str) and output.strip():
+            cleaned = clean_response(output)
+            if cleaned:
+                full_response += cleaned + "\n"
+    return full_response
+def create_ui(agent):
+    """Create the Gradio interface."""
+    with gr.Blocks(theme=gr.themes.Soft(), title="Clinical Oversight Assistant") as demo:
+        gr.Markdown("""
+        <h1 style='text-align: center;'>🩺 Comprehensive Clinical Oversight Assistant</h1>
+        <p style='text-align: center;'>Analyze complete medical records for potential oversights</p>
+        """)
+        with gr.Row():
+            with gr.Column(scale=3):
+                file_upload = gr.File(
+                    file_types=[".pdf", ".csv", ".xls", ".xlsx"],
+                    file_count="multiple",
+                    label="Upload Medical Records"
+                )
+                msg_input = gr.Textbox(
+                    placeholder="Optional: Add specific focus areas or questions...",
+                    label="Analysis Focus"
+                )
+                with gr.Row():
+                    send_btn = gr.Button("Analyze Full Document", variant="primary")
+                    clear_btn = gr.Button("Clear")
+                status = gr.Textbox(label="Status", interactive=False)
+            with gr.Column(scale=7):
+                report_output = gr.Textbox(
+                    label="Clinical Oversight Report",
+                    lines=20,
+                    max_lines=50,
+                    interactive=False
+                )
+                download_output = gr.File(
+                    label="Download Full Report",
+                    visible=False
+                )
+        def analyze(files: List, message: str):
+            """Process files and generate analysis."""
+            if not files:
+                yield "", None, "⚠️ Please upload at least one file to analyze."
+                return
+            yield "", None, "⏳ Processing documents..."
+            # Process all files completely
+            file_contents = []
+            filenames = []
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                futures = []
+                for f in files:
+                    futures.append(executor.submit(
+                        convert_file_to_json,
+                        f.name,
+                        f.name.split(".")[-1].lower()
+                    ))
+                    filenames.append(os.path.basename(f.name))
+                results = []
+                for future in as_completed(futures):
+                    results.append(sanitize_utf8(future.result()))
+                file_contents = results
+            combined_filename = " + ".join(filenames)
+            combined_content = "\n".join([
+                json.loads(fc).get("content", "") if "content" in json.loads(fc)
+                else str(json.loads(fc).get("rows", ""))
+                for fc in file_contents
+            ])
+            yield "", None, "🔍 Analyzing content..."
             try:
+                # Process the complete document
+                full_report = analyze_large_document(
+                    combined_content,
+                    combined_filename,
+                    agent
+                )
+                # Save report to file
+                file_hash_value = hashlib.md5(combined_content.encode()).hexdigest()
+                report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt")
+                with open(report_path, "w", encoding="utf-8") as f:
+                    f.write(full_report)
+                yield full_report, report_path if os.path.exists(report_path) else None, "✅ Analysis complete!"
             except Exception as e:
+                error_msg = f"❌ Error during analysis: {str(e)}"
+                print(error_msg)
+                yield "", None, error_msg
+        # UI event handlers
+        send_btn.click(
+            fn=analyze,
+            inputs=[file_upload, msg_input],
+            outputs=[report_output, download_output, status],
+            api_name="analyze"
+        )
+        clear_btn.click(
+            fn=lambda: ("", None, ""),
+            inputs=None,
+            outputs=[report_output, download_output, status]
+        )
     return demo
 if __name__ == "__main__":
     print("🚀 Launching app...")
     agent = init_agent()
     demo = create_ui(agent)
+    demo.queue(
+        api_open=False,
+        max_size=20
+    ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,