Ali2206 committed (verified)
Commit be8f191 · Parent: 92c6be9

Update app.py

Files changed (1):
  1. app.py (+96, -35)
app.py CHANGED
@@ -1,10 +1,10 @@
 import sys
 import os
-import polars as pl
+import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List
+from typing import List, Tuple, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
@@ -14,10 +14,12 @@ import subprocess
 import logging
 import torch
 import gc
-from cachetools import LFUCache
+from diskcache import Cache
 import time
-import asyncio
-import aiofiles
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pyarrow.csv as pc
+import numpy as np
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -49,7 +51,7 @@ sys.path.insert(0, src_path)
 from txagent.txagent import TxAgent
 
 # Initialize cache with 10GB limit
-cache = LFUCache(maxsize=1000) # Adjust maxsize based on memory constraints
+cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
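
Note: unlike the in-memory LFUCache it replaces, diskcache.Cache persists entries on disk and evicts them once size_limit is exceeded, so cached extractions can survive restarts (file_cache_dir is presumably defined in an unchanged part of app.py). A minimal sketch of the same dict-style usage, with a hypothetical cache directory:

    from diskcache import Cache

    cache = Cache("cache_dir", size_limit=10 * 1024**3)  # hypothetical path, 10 GB cap

    key = "example_report_pdf"            # hypothetical key
    if key not in cache:
        cache[key] = "expensive result"   # written through to disk; evicted once over the size cap
    print(cache[key])
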
@@ -93,33 +95,79 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
 
-async def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
+def excel_to_arrow(file_path: str) -> pa.Table:
+    """Convert Excel file to Arrow table for faster processing"""
     try:
-        cache_key = f"{os.path.basename(file_path)}_{file_type}"
+        # First try with openpyxl (faster for xlsx)
+        try:
+            df = pd.read_excel(file_path, engine='openpyxl', header=None, dtype=str)
+        except Exception:
+            # Fall back to xlrd if needed
+            df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
+
+        # Convert to Arrow table
+        table = pa.Table.from_pandas(df.fillna(""))
+        return table
+    except Exception as e:
+        logger.error(f"Error converting Excel to Arrow: {e}")
+        raise
+
+def csv_to_arrow(file_path: str) -> pa.Table:
+    """Convert CSV file to Arrow table for faster processing"""
+    try:
+        read_options = pc.ReadOptions(
+            encoding='utf-8',
+            invalid_row_handler=lambda x: None,
+            column_names=[str(i) for i in range(1000)] # Generous column count
+        )
+        convert_options = pc.ConvertOptions(
+            strings_can_be_null=True,
+            quoted_strings_can_be_null=True,
+            include_columns=None
+        )
+        table = pc.read_csv(
+            file_path,
+            read_options=read_options,
+            convert_options=convert_options
+        )
+        return table
+    except Exception as e:
+        logger.error(f"Error converting CSV to Arrow: {e}")
+        raise
+
+def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
+    try:
+        file_h = file_hash(file_path)
+        cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
             return cache[cache_key]
 
         if file_type == "pdf":
            text = extract_all_pages(file_path, progress_callback)
            result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
-        elif file_type == "csv":
-            df = pl.read_csv(file_path, encoding="utf8-lossy", has_header=False, infer_schema_length=0)
-            content = df.fill_null("").to_dicts()
-            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-        elif file_type in ["xls", "xlsx"]:
-            async def stream_excel_to_json():
-                df = pl.read_excel(file_path, read_csv_options={"infer_schema_length": 0})
-                chunk_size = 1000
-                rows = []
-                for i in range(0, len(df), chunk_size):
-                    chunk = df[i:i + chunk_size].fill_null("").to_dicts()
-                    rows.extend(chunk)
-                    if progress_callback:
-                        progress_callback(min(i + chunk_size, len(df)), len(df))
-                    await asyncio.sleep(0) # Yield control to event loop
-                return json.dumps({"filename": os.path.basename(file_path), "rows": rows})
-
-            result = await stream_excel_to_json()
+        elif file_type in ["csv", "xls", "xlsx"]:
+            # Use Arrow for tabular data processing
+            start_time = time.time()
+
+            if file_type == "csv":
+                table = csv_to_arrow(file_path)
+            else: # Excel files
+                table = excel_to_arrow(file_path)
+
+            # Convert to list of lists efficiently
+            content = []
+            for col in table.columns:
+                content.append([str(x) if x is not None else "" for x in col.to_pylist()])
+
+            # Transpose to get rows
+            rows = list(map(list, zip(*content)))
+
+            logger.info(f"Processed {len(rows)} rows in {time.time()-start_time:.2f}s")
+            result = json.dumps({
+                "filename": os.path.basename(file_path),
+                "rows": rows,
+                "arrow_processed": True # Flag for optimized processing
+            })
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
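
Note: in released pyarrow versions, invalid_row_handler is a ParseOptions argument (the handler is expected to return "skip" or "error"), not a ReadOptions argument, so csv_to_arrow as committed would likely raise a TypeError; pre-allocating 1000 column_names also pins the expected width. The openpyxl-to-xlrd fallback in excel_to_arrow only helps for legacy .xls inputs, since xlrd 2.x no longer reads .xlsx. A sketch of the presumed intent, with the CSV options on their usual classes (an assumption, not the committed code):

    import pyarrow as pa
    import pyarrow.csv as pc

    def csv_to_arrow_sketch(file_path: str) -> pa.Table:
        # Headerless read: let pyarrow auto-generate column names instead of pre-allocating them.
        read_options = pc.ReadOptions(autogenerate_column_names=True, encoding="utf-8")
        # Skip malformed rows instead of aborting the whole read.
        parse_options = pc.ParseOptions(invalid_row_handler=lambda row: "skip")
        convert_options = pc.ConvertOptions(strings_can_be_null=True, quoted_strings_can_be_null=True)
        return pc.read_csv(file_path, read_options=read_options,
                           parse_options=parse_options, convert_options=convert_options)
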
@@ -146,7 +194,9 @@ def log_system_usage(tag=""):
 
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
+    # Remove unwanted patterns and tool call artifacts
     text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
+    # Extract only missed diagnoses, ignoring other categories
     diagnoses = []
     lines = text.splitlines()
     in_diagnoses_section = False
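
Note: the boilerplate-stripping pattern in clean_response is long enough that naming and compiling it once at module level mainly helps readability (re caches compiled patterns, so the speed difference is minor). A sketch reusing the same alternation:

    import re

    # Hypothetical module-level constant; same alternation as used in clean_response.
    BOILERPLATE_RE = re.compile(
        r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\."
        r"|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.",
        flags=re.DOTALL,
    )

    def strip_boilerplate(text: str) -> str:
        return BOILERPLATE_RE.sub("", text)
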
@@ -164,18 +214,22 @@ def clean_response(text: str) -> str:
             diagnosis = re.sub(r"^\-\s*", "", line).strip()
             if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                 diagnoses.append(diagnosis)
+    # Join diagnoses into a plain text paragraph
     text = " ".join(diagnoses)
+    # Clean up extra whitespace and punctuation
     text = re.sub(r"\s+", " ", text).strip()
     text = re.sub(r"[^\w\s\.\,\(\)\-]", "", text)
     return text if text else ""
 
 def summarize_findings(combined_response: str) -> str:
+    # Split response by chunk analyses
     chunks = combined_response.split("--- Analysis for Chunk")
     diagnoses = []
     for chunk in chunks:
         chunk = chunk.strip()
         if not chunk or "No oversights identified" in chunk:
             continue
+        # Extract missed diagnoses from chunk
         lines = chunk.splitlines()
         in_diagnoses_section = False
         for line in lines:
@@ -192,16 +246,22 @@ def summarize_findings(combined_response: str) -> str:
                 diagnosis = re.sub(r"^\-\s*", "", line).strip()
                 if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                     diagnoses.append(diagnosis)
+
+    # Remove duplicates while preserving order
     seen = set()
     unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
+
     if not unique_diagnoses:
         return "No missed diagnoses were identified in the provided records."
+
+    # Combine into a single paragraph
     summary = "Missed diagnoses include " + ", ".join(unique_diagnoses[:-1])
     if len(unique_diagnoses) > 1:
         summary += f", and {unique_diagnoses[-1]}"
     elif len(unique_diagnoses) == 1:
         summary = "Missed diagnoses include " + unique_diagnoses[0]
     summary += ", all of which require urgent clinical review to prevent potential adverse outcomes."
+
     return summary.strip()
 
 def init_agent():
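
Note: the seen-set comprehension keeps the first occurrence of each diagnosis because set.add returns None; an equivalent one-liner, since dicts preserve insertion order in Python 3.7+:

    unique_diagnoses = list(dict.fromkeys(diagnoses))  # de-duplicate, keep first-seen order
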
@@ -227,7 +287,7 @@ def init_agent():
     logger.info("Agent Ready")
     return agent
 
-async def create_ui(agent):
+def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         chatbot = gr.Chatbot(label="Detailed Analysis", height=600, type="messages")
@@ -244,7 +304,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
 
-        async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
+        def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
             history.append({"role": "user", "content": message})
             yield history, None, ""
 
@@ -255,10 +315,11 @@ Patient Record Excerpt (Chunk {0} of {1}):
                 progress(current / total, desc=f"Extracting text... Page {current}/{total}")
                 return history, None, ""
 
-            tasks = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
-            results = await asyncio.gather(*tasks, return_exceptions=True)
-            extracted = "\n".join([sanitize_utf8(r) for r in results if isinstance(r, str)])
-            file_hash_value = file_hash(files[0].name) if files else ""
+            with ThreadPoolExecutor(max_workers=6) as executor:
+                futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
+                results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
+                extracted = "\n".join(results)
+                file_hash_value = file_hash(files[0].name) if files else ""
 
             history.append({"role": "assistant", "content": "✅ Text extraction complete."})
             yield history, None, ""
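
Note: as_completed yields futures in completion order, so the concatenated extracted text may not follow the order of files. If per-file order matters, executor.map preserves it; a minimal sketch, assuming the app's convert_file_to_json and sanitize_utf8 are in scope:

    from concurrent.futures import ThreadPoolExecutor

    def extract_in_order(files, progress_cb):
        # executor.map returns results in input order, unlike as_completed.
        with ThreadPoolExecutor(max_workers=6) as executor:
            results = executor.map(
                lambda f: convert_file_to_json(f.name, f.name.split(".")[-1].lower(), progress_cb),
                files,
            )
            return "\n".join(sanitize_utf8(r) for r in results)
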
@@ -313,8 +374,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
                 summary = summarize_findings(combined_response)
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
                 if report_path:
-                    async with aiofiles.open(report_path, "w", encoding="utf-8") as f:
-                        await f.write(combined_response + "\n\n" + summary)
+                    with open(report_path, "w", encoding="utf-8") as f:
+                        f.write(combined_response + "\n\n" + summary)
                 yield history, report_path if report_path and os.path.exists(report_path) else None, summary
 
             except Exception as e:
@@ -330,7 +391,7 @@ if __name__ == "__main__":
     try:
         logger.info("Launching app...")
         agent = init_agent()
-        demo = asyncio.run(create_ui(agent))
+        demo = create_ui(agent)
         demo.queue(api_open=False).launch(
             server_name="0.0.0.0",
             server_port=7860,
 