Ali2206 committed
Commit abc4511 · verified · 1 Parent(s): 6b734c9

Update app.py

Files changed (1)
  1. app.py +223 -155
app.py CHANGED
@@ -1,192 +1,260 @@
  import sys
  import os
  import gradio as gr
  import hashlib
  import time
- import json
- from concurrent.futures import ThreadPoolExecutor, as_completed
- import pandas as pd
- import pdfplumber

- # Set up environment
  os.environ.update({
-     "HF_HOME": "/data/hf_cache",
-     "TOKENIZERS_PARALLELISM": "false"
  })

- # Create cache directories
- os.makedirs("/data/hf_cache", exist_ok=True)
- os.makedirs("/data/file_cache", exist_ok=True)
- os.makedirs("/data/reports", exist_ok=True)
-
- # Import TxAgent after setting up environment
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "src")))
  from txagent.txagent import TxAgent

- # Initialize agent with error handling
- try:
-     agent = TxAgent(
-         model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
-         rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
-         tool_files_dict={"new_tool": "/data/tool_cache/new_tool.json"},
-         force_finish=True,
-         enable_checker=True,
-         step_rag_num=8,
-         seed=100
-     )
-     agent.init_model()
- except Exception as e:
-     print(f"Failed to initialize agent: {str(e)}")
-     agent = None

  def file_hash(path: str) -> str:
      with open(path, "rb") as f:
          return hashlib.md5(f.read()).hexdigest()

- def extract_text_from_pdf(file_path: str, max_pages: int = 10) -> str:
      try:
          with pdfplumber.open(file_path) as pdf:
-             return "\n".join(
-                 f"Page {i+1}:\n{(page.extract_text() or '').strip()}\n"
-                 for i, page in enumerate(pdf.pages[:max_pages])
-             )
      except Exception as e:
-         return f"PDF error: {str(e)}"

- def process_file(file_path: str, file_type: str) -> str:
      try:
-         cache_path = f"/data/file_cache/{file_hash(file_path)}.json"
          if os.path.exists(cache_path):
-             with open(cache_path, "r") as f:
-                 return f.read()
-
          if file_type == "pdf":
-             content = extract_text_from_pdf(file_path)
          elif file_type == "csv":
-             df = pd.read_csv(file_path, header=None, dtype=str, on_bad_lines="skip")
-             content = df.fillna("").to_string()
          elif file_type in ["xls", "xlsx"]:
-             df = pd.read_excel(file_path, header=None, dtype=str)
-             content = df.fillna("").to_string()
          else:
-             return json.dumps({"error": "Unsupported file type"})

-         result = json.dumps({"filename": os.path.basename(file_path), "content": content})
-         with open(cache_path, "w") as f:
              f.write(result)
          return result
      except Exception as e:
-         return json.dumps({"error": str(e)})
-
- def format_response(response: str) -> str:
-     response = response.replace("[TOOL_CALLS]", "").strip()
-     sections = {
-         "1. **Missed Diagnoses**:": "🔍 Missed Diagnoses",
-         "2. **Medication Conflicts**:": "💊 Medication Conflicts",
-         "3. **Incomplete Assessments**:": "📋 Incomplete Assessments",
-         "4. **Abnormal Results Needing Follow-up**:": "⚠️ Abnormal Results"
-     }
-     for old, new in sections.items():
-         response = response.replace(old, f"\n### {new}\n")
-     return response
-
- def analyze(message: str, history: list, files: list):
-     if agent is None:
-         yield history + [(message, "Agent initialization failed. Please try again later.")], None
-         return
-
-     history.append((message, None))
-     yield history, None
-
      try:
-         extracted_data = ""
-         if files:
-             with ThreadPoolExecutor() as executor:
-                 futures = [executor.submit(process_file, f.name, f.name.split(".")[-1])
-                            for f in files if hasattr(f, 'name')]
-                 extracted_data = "\n".join(f.result() for f in as_completed(futures))
-
-         prompt = f"""Review these medical records:
- {extracted_data[:10000]}
-
- Identify potential issues:
- 1. Missed diagnoses
- 2. Medication conflicts
- 3. Incomplete assessments
- 4. Abnormal results needing follow-up
-
- Analysis:"""
-
-         response = ""
-         for chunk in agent.run_gradio_chat(
-             message=prompt,
-             history=[],
-             temperature=0.2,
-             max_new_tokens=800
-         ):
-             if isinstance(chunk, str):
-                 response += chunk
-             elif isinstance(chunk, list):
-                 response += "".join(getattr(c, 'content', '') for c in chunk)
-
-             history[-1] = (message, format_response(response))
-             yield history, None
-
-         history[-1] = (message, format_response(response))
-         yield history, None
-
      except Exception as e:
-         history[-1] = (message, f" Error: {str(e)}")
-         yield history, None
-
- # Create the interface
- with gr.Blocks(
-     title="Clinical Oversight Assistant",
-     css="""
-     .gradio-container {
-         max-width: 1000px;
-         margin: auto;
-     }
-     .chatbot {
-         min-height: 500px;
-     }
-     """
- ) as demo:
-     gr.Markdown("# 🩺 Clinical Oversight Assistant")
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             files = gr.File(
-                 label="Upload Medical Records",
-                 file_types=[".pdf", ".csv", ".xlsx"],
-                 file_count="multiple"
-             )
-             query = gr.Textbox(
-                 label="Your Query",
-                 placeholder="Ask about potential oversights..."
-             )
-             submit = gr.Button("Analyze", variant="primary")
-
-         with gr.Column(scale=2):
-             chatbot = gr.Chatbot(
-                 label="Analysis Results",
-                 show_copy_button=True
-             )
-
-     submit.click(
-         analyze,
-         inputs=[query, chatbot, files],
-         outputs=[chatbot, gr.File(visible=False)]
-     )
-     query.submit(
-         analyze,
-         inputs=[query, chatbot, files],
-         outputs=[chatbot, gr.File(visible=False)]
      )

  if __name__ == "__main__":
-     demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
-         show_error=True
-     )

  import sys
  import os
+ import pandas as pd
+ import pdfplumber
+ import json
  import gradio as gr
+ from typing import List, Optional
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  import hashlib
+ import shutil
  import time
+ from functools import lru_cache
+ from threading import Thread
+ import re
+ import tempfile
+
+ # Environment setup
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ src_path = os.path.abspath(os.path.join(current_dir, "src"))
+ sys.path.insert(0, src_path)
+
+ # Cache directories
+ base_dir = "/data"
+ os.makedirs(base_dir, exist_ok=True)
+ model_cache_dir = os.path.join(base_dir, "txagent_models")
+ tool_cache_dir = os.path.join(base_dir, "tool_cache")
+ file_cache_dir = os.path.join(base_dir, "cache")
+ report_dir = "/data/reports"
+ vllm_cache_dir = os.path.join(base_dir, "vllm_cache")
+
+ os.makedirs(model_cache_dir, exist_ok=True)
+ os.makedirs(tool_cache_dir, exist_ok=True)
+ os.makedirs(file_cache_dir, exist_ok=True)
+ os.makedirs(report_dir, exist_ok=True)
+ os.makedirs(vllm_cache_dir, exist_ok=True)

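+ # The cache layout above keeps models, tool files, and parsed uploads under
+ # /data; the environment variables below point HF and vLLM at those locations.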
  os.environ.update({
+     "TRANSFORMERS_CACHE": model_cache_dir,
+     "HF_HOME": model_cache_dir,
+     "VLLM_CACHE_DIR": vllm_cache_dir,
+     "TOKENIZERS_PARALLELISM": "false",
+     "CUDA_LAUNCH_BLOCKING": "1"
  })

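+ # TOKENIZERS_PARALLELISM=false silences tokenizer fork warnings;
+ # CUDA_LAUNCH_BLOCKING=1 makes CUDA errors surface at the offending call.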
  from txagent.txagent import TxAgent

+ MEDICAL_KEYWORDS = {
+     'diagnosis', 'assessment', 'plan', 'results', 'medications',
+     'allergies', 'summary', 'impression', 'findings', 'recommendations'
+ }
+
+ def sanitize_utf8(text: str) -> str:
+     return text.encode("utf-8", "ignore").decode("utf-8")

  def file_hash(path: str) -> str:
      with open(path, "rb") as f:
          return hashlib.md5(f.read()).hexdigest()

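+ # The first three pages are always extracted; pages 4..max_pages are kept only
+ # if they mention a MEDICAL_KEYWORDS term, keeping prompts small but relevant.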
+ def extract_priority_pages(file_path: str, max_pages: int = 20) -> str:
      try:
+         text_chunks = []
          with pdfplumber.open(file_path) as pdf:
+             for i, page in enumerate(pdf.pages[:3]):
+                 text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
+             for i, page in enumerate(pdf.pages[3:max_pages], start=4):
+                 page_text = page.extract_text() or ""
+                 if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
+                     text_chunks.append(f"=== Page {i} ===\n{page_text.strip()}")
+         return "\n\n".join(text_chunks)
      except Exception as e:
+         return f"PDF processing error: {str(e)}"

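+ # Converts an upload to a JSON string, cached by MD5 content hash. PDFs get a
+ # fast priority-page extract now; a background Thread fills in the full text.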
+ def convert_file_to_json(file_path: str, file_type: str) -> str:
      try:
+         h = file_hash(file_path)
+         cache_path = os.path.join(file_cache_dir, f"{h}.json")
          if os.path.exists(cache_path):
+             return open(cache_path, "r", encoding="utf-8").read()
+
          if file_type == "pdf":
+             text = extract_priority_pages(file_path)
+             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
+             Thread(target=full_pdf_processing, args=(file_path, h)).start()
+
          elif file_type == "csv":
+             df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
+             content = df.fillna("").astype(str).values.tolist()
+             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+
          elif file_type in ["xls", "xlsx"]:
+             try:
+                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
+             except Exception:
+                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
+             content = df.fillna("").astype(str).values.tolist()
+             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+
          else:
+             return json.dumps({"error": f"Unsupported file type: {file_type}"})

+         with open(cache_path, "w", encoding="utf-8") as f:
              f.write(result)
          return result
+
      except Exception as e:
+         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
+
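+ # Background pass over the whole PDF: caches the complete text as JSON and
+ # writes a plain-text report that the UI offers for download.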
+ def full_pdf_processing(file_path: str, file_hash_value: str):
      try:
+         cache_path = os.path.join(file_cache_dir, f"{file_hash_value}_full.json")
+         if os.path.exists(cache_path):
+             return
+         with pdfplumber.open(file_path) as pdf:
+             full_text = "\n".join([f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}" for i, page in enumerate(pdf.pages)])
+         result = json.dumps({"filename": os.path.basename(file_path), "content": full_text, "status": "complete"})
+         with open(cache_path, "w", encoding="utf-8") as f:
+             f.write(result)
+         with open(os.path.join(report_dir, f"{file_hash_value}_report.txt"), "w", encoding="utf-8") as out:
+             out.write(full_text)
      except Exception as e:
+         print(f"Background processing failed: {str(e)}")
+
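+ # Copies the bundled tool manifest into the persistent tool cache on first
+ # run, then builds the TxAgent with its RAG model and loads the weights.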
+ def init_agent():
+     default_tool_path = os.path.abspath("data/new_tool.json")
+     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
+     if not os.path.exists(target_tool_path):
+         shutil.copy(default_tool_path, target_tool_path)
+
+     agent = TxAgent(
+         model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
+         rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
+         tool_files_dict={"new_tool": target_tool_path},
+         force_finish=True,
+         enable_checker=True,
+         step_rag_num=8,
+         seed=100,
+         additional_default_tools=[],
      )
+     agent.init_model()
+     return agent
+
+ def create_ui(agent: TxAgent):
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+         <h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>
+         <h3 style='text-align: center;'>Identify potential oversights in patient care</h3>
+         """)
+
+         chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
+         file_upload = gr.File(label="Upload Medical Records", file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
+         msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
+         send_btn = gr.Button("Analyze", variant="primary")
+         conversation_state = gr.State([])
+         download_output = gr.File(label="Download Full Report")
+
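+         # Generator callback: yields (chat messages, report file) pairs so the
+         # UI can stream partial analysis while the agent is still generating.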
+         def analyze_potential_oversights(message: str, history: list, conversation: list, files: list):
+             start_time = time.time()
+             try:
+                 # Add initial user and temporary assistant messages to update UI immediately
+                 history = history + [
+                     {"role": "user", "content": message},
+                     {"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."}
+                 ]
+                 yield history, None
+
+                 extracted_data = ""
+                 file_hash_value = ""
+                 if files and isinstance(files, list):
+                     with ThreadPoolExecutor(max_workers=4) as executor:
+                         futures = [
+                             executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower())
+                             for f in files if hasattr(f, 'name')
+                         ]
+                         extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
+                     file_hash_value = file_hash(files[0].name) if hasattr(files[0], 'name') else ""
+
+                 # Truncate extracted data to reduce overall token count (tune the character limit as needed)
+                 max_extracted_chars = 12000
+                 truncated_data = extracted_data[:max_extracted_chars]
+
+                 analysis_prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
+ 1. List potential missed diagnoses
+ 2. Flag any medication conflicts
+ 3. Note incomplete assessments
+ 4. Highlight abnormal results needing follow-up
+
+ Medical Records:
+ {truncated_data}
+
+ ### Potential Oversights:
+ """
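+                 # Chunks arrive either as plain strings (partial text) or as
+                 # lists of message objects whose .content is concatenated.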
+                 response = ""
+                 try:
+                     # Stream the agent responses; skip any None chunks
+                     for chunk in agent.run_gradio_chat(
+                         message=analysis_prompt,
+                         history=[],
+                         temperature=0.2,
+                         max_new_tokens=1024,
+                         max_token=4096,
+                         call_agent=False,
+                         conversation=conversation
+                     ):
+                         if chunk is None:
+                             continue
+                         if isinstance(chunk, str):
+                             response += chunk
+                         elif isinstance(chunk, list):
+                             response += "".join([c.content for c in chunk if hasattr(c, 'content')])
+                         # Yield partial response updates
+                         cleaned = response.replace("[TOOL_CALLS]", "").strip()
+                         yield history[:-1] + [{"role": "assistant", "content": cleaned}], None
+                 except Exception as agent_error:
+                     history.append({"role": "assistant", "content": f"❌ Analysis failed during processing: {str(agent_error)}"})
+                     yield history, None
+                     return
+
+                 final_output = response.replace("[TOOL_CALLS]", "").strip()
+                 if not final_output:
+                     final_output = "No clear oversights identified. Recommend comprehensive review."
+
+                 report_path = None
+                 if file_hash_value:
+                     possible_report = os.path.join(report_dir, f"{file_hash_value}_report.txt")
+                     if os.path.exists(possible_report):
+                         report_path = possible_report
+
+                 history = history[:-1] + [{"role": "assistant", "content": final_output}]
+                 yield history, report_path
+
+             except Exception as e:
+                 history.append({"role": "assistant", "content": f"❌ Analysis failed: {str(e)}"})
+                 yield history, None
+
+         inputs = [msg_input, chatbot, conversation_state, file_upload]
+         outputs = [chatbot, download_output]
+         send_btn.click(analyze_potential_oversights, inputs=inputs, outputs=outputs)
+         msg_input.submit(analyze_potential_oversights, inputs=inputs, outputs=outputs)
+
+         gr.Examples([
+             ["What might have been missed in this patient's treatment?"],
+             ["Are there any medication conflicts in these records?"],
+             ["What abnormal results require follow-up?"]
+         ], inputs=msg_input)
+
+     return demo

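+ # Agent initialization happens before the UI launches so startup failures
+ # appear in the logs rather than on the first request.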
  if __name__ == "__main__":
+     print("Initializing medical analysis agent...")
+     agent = init_agent()
+
+     print("Launching interface...")
+     demo = create_ui(agent)
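+     # api_open=False keeps the queue endpoints private; allowed_paths lets
+     # Gradio serve the generated reports under /data/reports for download.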
+     demo.queue(api_open=False).launch(
          server_name="0.0.0.0",
          server_port=7860,
+         show_error=True,
+         allowed_paths=["/data/reports"],
+         share=False
+     )