CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 committed on Apr 18

Commit

fc9566f

verified ·

1 Parent(s): 4044b30

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -38

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import sys
 import os
-import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
@@ -14,8 +14,10 @@ import subprocess
 import logging
 import torch
 import gc
-from diskcache import Cache
 import time
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -47,7 +49,7 @@ sys.path.insert(0, src_path)
 from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
-cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
@@ -91,10 +93,10 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
-def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
-    try:
-        file_h = file_hash(file_path)
-        cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
             return cache[cache_key]
@@ -102,17 +104,23 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             text = extract_all_pages(file_path, progress_callback)
             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
         elif file_type == "csv":
-            df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
-                             skip_blank_lines=False, on_bad_lines="skip")
-            content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         elif file_type in ["xls", "xlsx"]:
-            try:
-                df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
-            except Exception:
-                df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
-            content = df.fillna("").astype(str).values.tolist()
-            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
@@ -139,9 +147,7 @@ def log_system_usage(tag=""):
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
-    # Remove unwanted patterns and tool call artifacts
     text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
-    # Extract only missed diagnoses, ignoring other categories
     diagnoses = []
     lines = text.splitlines()
     in_diagnoses_section = False
@@ -159,22 +165,18 @@ def clean_response(text: str) -> str:
             diagnosis = re.sub(r"^\-\s*", "", line).strip()
             if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                 diagnoses.append(diagnosis)
-    # Join diagnoses into a plain text paragraph
     text = " ".join(diagnoses)
-    # Clean up extra whitespace and punctuation
     text = re.sub(r"\s+", " ", text).strip()
     text = re.sub(r"[^\w\s\.\,\(\)\-]", "", text)
     return text if text else ""
 def summarize_findings(combined_response: str) -> str:
-    # Split response by chunk analyses
     chunks = combined_response.split("--- Analysis for Chunk")
     diagnoses = []
     for chunk in chunks:
         chunk = chunk.strip()
         if not chunk or "No oversights identified" in chunk:
             continue
-        # Extract missed diagnoses from chunk
         lines = chunk.splitlines()
         in_diagnoses_section = False
         for line in lines:
@@ -191,22 +193,16 @@ def summarize_findings(combined_response: str) -> str:
                 diagnosis = re.sub(r"^\-\s*", "", line).strip()
                 if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                     diagnoses.append(diagnosis)
-    # Remove duplicates while preserving order
     seen = set()
     unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
     if not unique_diagnoses:
         return "No missed diagnoses were identified in the provided records."
-    # Combine into a single paragraph
     summary = "Missed diagnoses include " + ", ".join(unique_diagnoses[:-1])
     if len(unique_diagnoses) > 1:
         summary += f", and {unique_diagnoses[-1]}"
     elif len(unique_diagnoses) == 1:
         summary = "Missed diagnoses include " + unique_diagnoses[0]
     summary += ", all of which require urgent clinical review to prevent potential adverse outcomes."
     return summary.strip()
 def init_agent():
@@ -232,7 +228,7 @@ def init_agent():
     logger.info("Agent Ready")
     return agent
-def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         chatbot = gr.Chatbot(label="Detailed Analysis", height=600, type="messages")
@@ -249,7 +245,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
-        def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
             history.append({"role": "user", "content": message})
             yield history, None, ""
@@ -260,11 +256,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     progress(current / total, desc=f"Extracting text... Page {current}/{total}")
                     return history, None, ""
-                with ThreadPoolExecutor(max_workers=6) as executor:
-                    futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
-                    results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
-                    extracted = "\n".join(results)
-                    file_hash_value = file_hash(files[0].name) if files else ""
                 history.append({"role": "assistant", "content": "✅ Text extraction complete."})
                 yield history, None, ""
@@ -319,8 +314,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
                 summary = summarize_findings(combined_response)
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
                 if report_path:
-                    with open(report_path, "w", encoding="utf-8") as f:
-                        f.write(combined_response + "\n\n" + summary)
                 yield history, report_path if report_path and os.path.exists(report_path) else None, summary
             except Exception as e:
@@ -336,7 +331,7 @@ if __name__ == "__main__":
     try:
         logger.info("Launching app...")
         agent = init_agent()
-        demo = create_ui(agent)
         demo.queue(api_open=False).launch(
             server_name="0.0.0.0",
             server_port=7860,

 import sys
 import os
+import polars as pl
 import pdfplumber
 import json
 import gradio as gr
 import logging
 import torch
 import gc
+from cachetools import LFUCache
 import time
+import asyncio
+import aiofiles
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
+cache = LFUCache(maxsize=1000)  # Adjust maxsize based on memory constraints
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
+async def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
+    try.:
+        cache_key = f"{os.path.basename(file_path)}_{file_type}"
         if cache_key in cache:
             return cache[cache_key]
             text = extract_all_pages(file_path, progress_callback)
             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
         elif file_type == "csv":
+            df = pl.read_csv(file_path, encoding="utf8-lossy", has_header=False, infer_schema_length=0)
+            content = df.fill_null("").to_dicts()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
         elif file_type in ["xls", "xlsx"]:
+            async def stream_excel_to_json():
+                df = pl.read_excel(file_path, read_csv_options={"infer_schema_length": 0})
+                chunk_size = 1000
+                rows = []
+                for i in range(0, len(df), chunk_size):
+                    chunk = df[i:i + chunk_size].fill_null("").to_dicts()
+                    rows.extend(chunk)
+                    if progress_callback:
+                        progress_callback(min(i + chunk_size, len(df)), len(df))
+                    await asyncio.sleep(0)  # Yield control to event loop
+                return json.dumps({"filename": os.path.basename(file_path), "rows": rows})
+            result = await stream_excel_to_json()
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
     text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
     diagnoses = []
     lines = text.splitlines()
     in_diagnoses_section = False
             diagnosis = re.sub(r"^\-\s*", "", line).strip()
             if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                 diagnoses.append(diagnosis)
     text = " ".join(diagnoses)
     text = re.sub(r"\s+", " ", text).strip()
     text = re.sub(r"[^\w\s\.\,\(\)\-]", "", text)
     return text if text else ""
 def summarize_findings(combined_response: str) -> str:
     chunks = combined_response.split("--- Analysis for Chunk")
     diagnoses = []
     for chunk in chunks:
         chunk = chunk.strip()
         if not chunk or "No oversights identified" in chunk:
             continue
         lines = chunk.splitlines()
         in_diagnoses_section = False
         for line in lines:
                 diagnosis = re.sub(r"^\-\s*", "", line).strip()
                 if diagnosis and not re.match(r"No issues identified", diagnosis, re.IGNORECASE):
                     diagnoses.append(diagnosis)
     seen = set()
     unique_diagnoses = [d for d in diagnoses if not (d in seen or seen.add(d))]
     if not unique_diagnoses:
         return "No missed diagnoses were identified in the provided records."
     summary = "Missed diagnoses include " + ", ".join(unique_diagnoses[:-1])
     if len(unique_diagnoses) > 1:
         summary += f", and {unique_diagnoses[-1]}"
     elif len(unique_diagnoses) == 1:
         summary = "Missed diagnoses include " + unique_diagnoses[0]
     summary += ", all of which require urgent clinical review to prevent potential adverse outcomes."
     return summary.strip()
 def init_agent():
     logger.info("Agent Ready")
     return agent
+async def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         chatbot = gr.Chatbot(label="Detailed Analysis", height=600, type="messages")
 {chunk}
 """
+        async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
             history.append({"role": "user", "content": message})
             yield history, None, ""
                     progress(current / total, desc=f"Extracting text... Page {current}/{total}")
                     return history, None, ""
+                tasks = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+                extracted = "\n".join([sanitize_utf8(r) for r in results if isinstance(r, str)])
+                file_hash_value = file_hash(files[0].name) if files else ""
                 history.append({"role": "assistant", "content": "✅ Text extraction complete."})
                 yield history, None, ""
                 summary = summarize_findings(combined_response)
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
                 if report_path:
+                    async with aiofiles.open(report_path, "w", encoding="utf-8") as f:
+                        await f.write(combined_response + "\n\n" + summary)
                 yield history, report_path if report_path and os.path.exists(report_path) else None, summary
             except Exception as e:
     try:
         logger.info("Launching app...")
         agent = init_agent()
+        demo = asyncio.run(create_ui(agent))
         demo.queue(api_open=False).launch(
             server_name="0.0.0.0",
             server_port=7860,