Ali2206 committed on
Commit 32e4e6a · verified · 1 Parent(s): c4e7e4a

Update app.py

Files changed (1):
  1. app.py +86 -348
app.py CHANGED
@@ -1,3 +1,4 @@
+import sys
 import os
 import json
 import shutil
@@ -24,18 +25,24 @@ tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
 file_cache_dir = os.path.join(persistent_dir, "cache")
 report_dir = os.path.join(persistent_dir, "reports")
 
+# Create directories if they don't exist
 for d in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir]:
     os.makedirs(d, exist_ok=True)
 
+# Set environment variables
 os.environ["HF_HOME"] = model_cache_dir
 os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
+os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"  # Fix for matplotlib permission issues
 
+# Set up Python path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_path = os.path.abspath(os.path.join(current_dir, "src"))
 sys.path.insert(0, src_path)
 
+# Import TxAgent after setting up paths
 from txagent.txagent import TxAgent
 
+# Constants
 MAX_MODEL_TOKENS = 131072
 MAX_NEW_TOKENS = 4096
 MAX_CHUNK_TOKENS = 8192
@@ -43,9 +50,12 @@ BATCH_SIZE = 1
 PROMPT_OVERHEAD = 300
 SAFE_SLEEP = 0.5
 
-app = FastAPI(title="Clinical Patient Support System API",
-              description="API for analyzing and summarizing unstructured medical files",
-              version="1.0.0")
+# Initialize FastAPI app
+app = FastAPI(
+    title="Clinical Patient Support System API",
+    description="API for analyzing and summarizing unstructured medical files",
+    version="1.0.0"
+)
 
 # CORS configuration
 app.add_middleware(
@@ -62,107 +72,17 @@ agent = None
 @app.on_event("startup")
 async def startup_event():
     global agent
-    agent = init_agent()
-
-def estimate_tokens(text: str) -> int:
-    return len(text) // 4 + 1
-
-def clean_response(text: str) -> str:
-    text = re.sub(r"\[.*?\]|\bNone\b", "", text, flags=re.DOTALL)
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    return text.strip()
-
-def remove_duplicate_paragraphs(text: str) -> str:
-    paragraphs = text.strip().split("\n\n")
-    seen = set()
-    unique_paragraphs = []
-    for p in paragraphs:
-        clean_p = p.strip()
-        if clean_p and clean_p not in seen:
-            unique_paragraphs.append(clean_p)
-            seen.add(clean_p)
-    return "\n\n".join(unique_paragraphs)
-
-def extract_text_from_excel(path: str) -> str:
-    all_text = []
-    xls = pd.ExcelFile(path)
-    for sheet_name in xls.sheet_names:
-        try:
-            df = xls.parse(sheet_name).astype(str).fillna("")
-        except Exception:
-            continue
-        for _, row in df.iterrows():
-            non_empty = [cell.strip() for cell in row if cell.strip()]
-            if len(non_empty) >= 2:
-                text_line = " | ".join(non_empty)
-                if len(text_line) > 15:
-                    all_text.append(f"[{sheet_name}] {text_line}")
-    return "\n".join(all_text)
-
-def extract_text_from_csv(path: str) -> str:
-    all_text = []
     try:
-        df = pd.read_csv(path).astype(str).fillna("")
-    except Exception:
-        return ""
-    for _, row in df.iterrows():
-        non_empty = [cell.strip() for cell in row if cell.strip()]
-        if len(non_empty) >= 2:
-            text_line = " | ".join(non_empty)
-            if len(text_line) > 15:
-                all_text.append(text_line)
-    return "\n".join(all_text)
-
-def extract_text_from_pdf(path: str) -> str:
-    import logging
-    logging.getLogger("pdfminer").setLevel(logging.ERROR)
-    all_text = []
-    try:
-        with pdfplumber.open(path) as pdf:
-            for page in pdf.pages:
-                text = page.extract_text()
-                if text:
-                    all_text.append(text.strip())
-    except Exception:
-        return ""
-    return "\n".join(all_text)
-
-def extract_text(file_path: str) -> str:
-    if file_path.endswith(".xlsx"):
-        return extract_text_from_excel(file_path)
-    elif file_path.endswith(".csv"):
-        return extract_text_from_csv(file_path)
-    elif file_path.endswith(".pdf"):
-        return extract_text_from_pdf(file_path)
-    else:
-        return ""
-
-def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
-    effective_limit = max_tokens - PROMPT_OVERHEAD
-    chunks, current, current_tokens = [], [], 0
-    for line in text.split("\n"):
-        tokens = estimate_tokens(line)
-        if current_tokens + tokens > effective_limit:
-            if current:
-                chunks.append("\n".join(current))
-            current, current_tokens = [line], tokens
-        else:
-            current.append(line)
-            current_tokens += tokens
-    if current:
-        chunks.append("\n".join(current))
-    return chunks
-
-def batch_chunks(chunks: List[str], batch_size: int = BATCH_SIZE) -> List[List[str]]:
-    return [chunks[i:i+batch_size] for i in range(0, len(chunks), batch_size)]
-
-def build_prompt(chunk: str) -> str:
-    return f"""### Unstructured Clinical Records\n\nAnalyze the clinical notes below and summarize with:\n- Diagnostic Patterns\n- Medication Issues\n- Missed Opportunities\n- Inconsistencies\n- Follow-up Recommendations\n\n---\n\n{chunk}\n\n---\nRespond concisely in bullet points with clinical reasoning."""
+        agent = init_agent()
+    except Exception as e:
+        raise RuntimeError(f"Failed to initialize agent: {str(e)}")
 
 def init_agent() -> TxAgent:
+    """Initialize and return the TxAgent instance."""
     tool_path = os.path.join(tool_cache_dir, "new_tool.json")
     if not os.path.exists(tool_path):
         shutil.copy(os.path.abspath("data/new_tool.json"), tool_path)
+
     agent = TxAgent(
         model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
         rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
@@ -175,229 +95,63 @@ def init_agent() -> TxAgent:
     agent.init_model()
     return agent
 
-def analyze_batches(agent, batches: List[List[str]]) -> List[str]:
-    results = []
-    for batch in batches:
-        prompt = "\n\n".join(build_prompt(chunk) for chunk in batch)
-        try:
-            batch_response = ""
-            for r in agent.run_gradio_chat(
-                message=prompt,
-                history=[],
-                temperature=0.0,
-                max_new_tokens=MAX_NEW_TOKENS,
-                max_token=MAX_MODEL_TOKENS,
-                call_agent=False,
-                conversation=[]
-            ):
-                if isinstance(r, str):
-                    batch_response += r
-                elif isinstance(r, list):
-                    for m in r:
-                        if hasattr(m, "content"):
-                            batch_response += m.content
-                elif hasattr(r, "content"):
-                    batch_response += r.content
-            results.append(clean_response(batch_response))
-            time.sleep(SAFE_SLEEP)
-        except Exception as e:
-            results.append(f"❌ Batch failed: {str(e)}")
-            time.sleep(SAFE_SLEEP * 2)
-        torch.cuda.empty_cache()
-        gc.collect()
-    return results
-
-def generate_final_summary(agent, combined: str) -> str:
-    combined = remove_duplicate_paragraphs(combined)
-    final_prompt = f"""
-You are an expert clinical summarizer. Analyze the following summaries carefully and generate a **single final concise structured medical report**, avoiding any repetition or redundancy.
-Summaries:
-{combined}
-Respond with:
-- Diagnostic Patterns
-- Medication Issues
-- Missed Opportunities
-- Inconsistencies
-- Follow-up Recommendations
-Avoid repeating the same points multiple times.
-""".strip()
-
-    final_response = ""
-    for r in agent.run_gradio_chat(
-        message=final_prompt,
-        history=[],
-        temperature=0.0,
-        max_new_tokens=MAX_NEW_TOKENS,
-        max_token=MAX_MODEL_TOKENS,
-        call_agent=False,
-        conversation=[]
-    ):
-        if isinstance(r, str):
-            final_response += r
-        elif isinstance(r, list):
-            for m in r:
-                if hasattr(m, "content"):
-                    final_response += m.content
-        elif hasattr(r, "content"):
-            final_response += r.content
-
-    final_response = clean_response(final_response)
-    final_response = remove_duplicate_paragraphs(final_response)
-    return final_response
-
-def remove_non_ascii(text):
-    return ''.join(c for c in text if ord(c) < 256)
-
-def generate_pdf_report_with_charts(summary: str, report_path: str, detailed_batches: List[str] = None):
-    chart_dir = os.path.join(os.path.dirname(report_path), "charts")
-    os.makedirs(chart_dir, exist_ok=True)
-
-    # Prepare static data
-    categories = ['Diagnostics', 'Medications', 'Missed', 'Inconsistencies', 'Follow-up']
-    values = [4, 2, 3, 1, 5]
-
-    # === Static Charts ===
-    chart_paths = []
-
-    def save_chart(fig_func, filename):
-        path = os.path.join(chart_dir, filename)
-        fig_func()
-        plt.tight_layout()
-        plt.savefig(path)
-        plt.close()
-        chart_paths.append((filename.split('.')[0].replace('_', ' ').title(), path))
-
-    save_chart(lambda: plt.bar(categories, values), "bar_chart.png")
-    save_chart(lambda: plt.pie(values, labels=categories, autopct='%1.1f%%'), "pie_chart.png")
-    save_chart(lambda: plt.plot(categories, values, marker='o'), "trend_chart.png")
-    save_chart(lambda: plt.barh(categories, values), "horizontal_bar_chart.png")
-
-    # Radar chart
-    import numpy as np
-    labels = np.array(categories)
-    stats = np.array(values)
-    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
-    stats = np.concatenate((stats, [stats[0]]))
-    angles += angles[:1]
-    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
-    ax.plot(angles, stats, marker='o')
-    ax.fill(angles, stats, alpha=0.25)
-    ax.set_yticklabels([])
-    ax.set_xticks(angles[:-1])
-    ax.set_xticklabels(labels)
-    ax.set_title('Radar Chart: Clinical Focus')
-    radar_path = os.path.join(chart_dir, "radar_chart.png")
-    plt.tight_layout()
-    plt.savefig(radar_path)
-    plt.close()
-    chart_paths.append(("Radar Chart: Clinical Focus", radar_path))
-
-    # === Dynamic Chart: Drug Frequency ===
-    drug_counter = {}
-    if detailed_batches:
-        for batch in detailed_batches:
-            lines = batch.split("\n")
-            for line in lines:
-                match = re.search(r"(?i)medication[s]?:\s*(.+)", line)
-                if match:
-                    items = re.split(r"[,;]", match.group(1))
-                    for item in items:
-                        drug = item.strip().title()
-                        if len(drug) > 2:
-                            drug_counter[drug] = drug_counter.get(drug, 0) + 1
-
-    if drug_counter:
-        drugs, freqs = zip(*sorted(drug_counter.items(), key=lambda x: x[1], reverse=True)[:10])
-        plt.figure(figsize=(6, 4))
-        plt.bar(drugs, freqs)
-        plt.xticks(rotation=45, ha='right')
-        plt.title('Top Medications Frequency')
-        drug_chart_path = os.path.join(chart_dir, "drug_frequency_chart.png")
-        plt.tight_layout()
-        plt.savefig(drug_chart_path)
-        plt.close()
-        chart_paths.append(("Top Medications Frequency", drug_chart_path))
-
-    # === PDF ===
-    pdf_path = report_path.replace('.md', '.pdf')
-    pdf = FPDF()
-    pdf.set_auto_page_break(auto=True, margin=20)
-
-    def add_section_title(pdf, title):
-        pdf.set_fill_color(230, 230, 230)
-        pdf.set_font("Arial", 'B', 14)
-        pdf.cell(0, 10, remove_non_ascii(title), ln=True, fill=True)
-        pdf.ln(3)
-
-    def add_footer(pdf):
-        pdf.set_y(-15)
-        pdf.set_font('Arial', 'I', 8)
-        pdf.set_text_color(150, 150, 150)
-        pdf.cell(0, 10, f"Page {pdf.page_no()}", align='C')
-
-    # Title Page
-    pdf.add_page()
-    pdf.set_font("Arial", 'B', 26)
-    pdf.set_text_color(0, 70, 140)
-    pdf.cell(0, 20, remove_non_ascii("Final Medical Report"), ln=True, align='C')
-    pdf.set_text_color(0, 0, 0)
-    pdf.set_font("Arial", '', 13)
-    pdf.cell(0, 10, datetime.now().strftime("Generated on %B %d, %Y at %H:%M"), ln=True, align='C')
-    pdf.ln(15)
-    pdf.set_font("Arial", '', 11)
-    pdf.set_fill_color(245, 245, 245)
-    pdf.multi_cell(0, 9, remove_non_ascii(
-        "This report contains a professional summary of clinical observations, potential inconsistencies, and follow-up recommendations based on the uploaded medical document."
-    ), border=1, fill=True, align="J")
-    add_footer(pdf)
-
-    # Final Summary
-    pdf.add_page()
-    add_section_title(pdf, "Final Summary")
-    pdf.set_font("Arial", '', 11)
-    for line in summary.split("\n"):
-        clean_line = remove_non_ascii(line.strip())
-        if clean_line:
-            pdf.multi_cell(0, 8, txt=clean_line)
-    add_footer(pdf)
+# Utility functions (keep your existing functions but add error handling)
+def estimate_tokens(text: str) -> int:
+    """Estimate the number of tokens in the given text."""
+    return len(text) // 4 + 1
 
-    # Charts Section
-    pdf.add_page()
-    add_section_title(pdf, "Statistical Overview")
-    for title, path in chart_paths:
-        pdf.set_font("Arial", 'B', 12)
-        pdf.cell(0, 9, remove_non_ascii(title), ln=True)
-        pdf.image(path, w=170)
-        pdf.ln(6)
-    add_footer(pdf)
+def clean_response(text: str) -> str:
+    """Clean and format the response text."""
+    if not text:
+        return ""
+    text = re.sub(r"\[.*?\]|\bNone\b", "", text, flags=re.DOTALL)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
 
-    # Detailed Tool Outputs
-    if detailed_batches:
-        pdf.add_page()
-        add_section_title(pdf, "Detailed Tool Insights")
-        for idx, detail in enumerate(detailed_batches):
-            pdf.set_font("Arial", 'B', 12)
-            pdf.cell(0, 9, remove_non_ascii(f"Tool Output #{idx + 1}"), ln=True)
-            pdf.set_font("Arial", '', 11)
-            for line in remove_non_ascii(detail).split("\n"):
-                pdf.multi_cell(0, 8, txt=line.strip())
-            pdf.ln(3)
-    add_footer(pdf)
+def extract_text_from_excel(path: str) -> str:
+    """Extract text from Excel file."""
+    try:
+        all_text = []
+        xls = pd.ExcelFile(path)
+        for sheet_name in xls.sheet_names:
+            try:
+                df = xls.parse(sheet_name).astype(str).fillna("")
+            except Exception:
+                continue
+            for _, row in df.iterrows():
+                non_empty = [cell.strip() for cell in row if cell.strip()]
+                if len(non_empty) >= 2:
+                    text_line = " | ".join(non_empty)
+                    if len(text_line) > 15:
+                        all_text.append(f"[{sheet_name}] {text_line}")
+        return "\n".join(all_text)
+    except Exception as e:
+        raise RuntimeError(f"Failed to extract text from Excel: {str(e)}")
 
-    pdf.output(pdf_path)
-    return pdf_path
+def extract_text(file_path: str) -> str:
+    """Extract text from supported file types."""
+    try:
+        if file_path.endswith(".xlsx"):
+            return extract_text_from_excel(file_path)
+        elif file_path.endswith(".csv"):
+            df = pd.read_csv(file_path).astype(str).fillna("")
+            return "\n".join(
+                " | ".join(cell.strip() for cell in row if cell.strip())
+                for _, row in df.iterrows()
+                if len([cell for cell in row if cell.strip()]) >= 2
+            )
+        elif file_path.endswith(".pdf"):
+            with pdfplumber.open(file_path) as pdf:
+                return "\n".join(page.extract_text() or "" for page in pdf.pages)
+        else:
+            return ""
+    except Exception as e:
+        raise RuntimeError(f"Failed to extract text from file: {str(e)}")
 
-@app.post("/analyze", summary="Analyze medical document", response_description="Returns analysis results")
+# API endpoints
+@app.post("/analyze")
 async def analyze_document(file: UploadFile = File(...)):
-    """
-    Analyze a medical document (PDF, Excel, or CSV) and return a structured analysis.
-
-    Args:
-        file: The medical document to analyze (PDF, Excel, or CSV format)
-
-    Returns:
-        JSONResponse: Contains analysis results and report download path
-    """
+    """Analyze a medical document and return results."""
     start_time = time.time()
 
     try:
@@ -413,50 +167,40 @@ async def analyze_document(file: UploadFile = File(...)):
         chunks = split_text(extracted)
         batches = batch_chunks(chunks, batch_size=BATCH_SIZE)
         batch_results = analyze_batches(agent, batches)
-        all_tool_outputs = batch_results.copy()
-        valid = [res for res in batch_results if not res.startswith("❌")]
-
-        if not valid:
+
+        valid_results = [res for res in batch_results if not res.startswith("❌")]
+        if not valid_results:
             raise HTTPException(status_code=400, detail="No valid analysis results were generated")
 
-        summary = generate_final_summary(agent, "\n\n".join(valid))
+        final_summary = generate_final_summary(agent, "\n\n".join(valid_results))
 
         # Generate report files
         report_filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        report_path = os.path.join(report_dir, f"{report_filename}.md")
         with open(report_path, 'w', encoding='utf-8') as f:
-            f.write(f"# Final Medical Report\n\n{summary}")
+            f.write(f"# Final Medical Report\n\n{final_summary}")
 
-        pdf_path = generate_pdf_report_with_charts(summary, report_path, detailed_batches=all_tool_outputs)
-
-        end_time = time.time()
-        elapsed_time = end_time - start_time
+        pdf_path = generate_pdf_report_with_charts(final_summary, report_path, detailed_batches=batch_results)
 
         # Clean up temp file
         os.remove(temp_path)
 
         return JSONResponse({
             "status": "success",
-            "summary": summary,
+            "summary": final_summary,
             "report_path": f"/reports/{os.path.basename(pdf_path)}",
-            "processing_time": f"{elapsed_time:.2f} seconds",
-            "detailed_outputs": all_tool_outputs
+            "processing_time": f"{time.time() - start_time:.2f} seconds",
+            "detailed_outputs": batch_results
         })
 
+    except HTTPException:
+        raise
     except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
-@app.get("/reports/{filename}", response_class=FileResponse)
+@app.get("/reports/{filename}")
 async def download_report(filename: str):
-    """
-    Download a generated report PDF file.
-
-    Args:
-        filename: The name of the report file to download
-
-    Returns:
-        FileResponse: The PDF file for download
-    """
+    """Download a generated report."""
     file_path = os.path.join(report_dir, filename)
     if not os.path.exists(file_path):
         raise HTTPException(status_code=404, detail="Report not found")
@@ -464,20 +208,14 @@ async def download_report(filename: str):
 
 @app.get("/status")
 async def service_status():
-    """
-    Check the service status and version information.
-
-    Returns:
-        JSONResponse: Service status information
-    """
-    return JSONResponse({
+    """Check service status."""
+    return {
         "status": "running",
         "version": "1.0.0",
         "model": "mims-harvard/TxAgent-T1-Llama-3.1-8B",
-        "rag_model": "mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
        "max_tokens": MAX_MODEL_TOKENS,
         "supported_file_types": [".pdf", ".xlsx", ".csv"]
-    })
+    }
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
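
A note on the retained chunking constants: with the len(text) // 4 + 1 estimator, split_text (kept in app.py per the "keep your existing functions" comment above) budgets MAX_CHUNK_TOKENS minus PROMPT_OVERHEAD tokens per chunk, i.e. roughly 31.5k characters of notes. A minimal sketch of that arithmetic, with names copied from app.py and the printout purely illustrative:

# Chunk budget implied by the constants in app.py (illustrative sketch)
MAX_CHUNK_TOKENS = 8192
PROMPT_OVERHEAD = 300

def estimate_tokens(text: str) -> int:
    # ~4 characters per token, same heuristic as app.py
    return len(text) // 4 + 1

effective_limit = MAX_CHUNK_TOKENS - PROMPT_OVERHEAD  # 7892 tokens
print(f"~{effective_limit * 4} characters of notes per chunk")  # ~31568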
 
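To exercise the updated endpoints, here is a minimal client sketch. It assumes the app is running locally on port 7860 (matching the uvicorn.run call); sample.pdf is a placeholder file name, and the requests package is an external dependency, not part of this commit:

import requests

BASE = "http://localhost:7860"

# Health check against the trimmed /status payload
print(requests.get(f"{BASE}/status").json())

# POST a document to /analyze; the multipart field must be named "file"
# to match the UploadFile = File(...) parameter
with open("sample.pdf", "rb") as fh:
    resp = requests.post(f"{BASE}/analyze",
                         files={"file": ("sample.pdf", fh, "application/pdf")})
resp.raise_for_status()
result = resp.json()
print(result["processing_time"])

# result["report_path"] points at the /reports/{filename} route
report = requests.get(BASE + result["report_path"])
with open("downloaded_report.pdf", "wb") as out:
    out.write(report.content)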