CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 committed on Apr 17

Commit

eea533f

verified ·

1 Parent(s): 6358a36

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -122

app.py CHANGED Viewed

@@ -14,11 +14,6 @@ import subprocess
 import multiprocessing
 from functools import partial
 import time
-import logging
-# Setup logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", filename="/home/user/clinical_oversight_analyzer.log")
-logger = logging.getLogger(__name__)
 # Persistent directory
 persistent_dir = "/data/hf_cache"
@@ -34,12 +29,10 @@ for directory in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir, v
     os.makedirs(directory, exist_ok=True)
 os.environ["HF_HOME"] = model_cache_dir
 os.environ["VLLM_CACHE_DIR"] = vllm_cache_dir
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-# Remove TRANSFORMERS_CACHE to suppress warning
-if "TRANSFORMERS_CACHE" in os.environ:
-    del os.environ["TRANSFORMERS_CACHE"]
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_path = os.path.abspath(os.path.join(current_dir, "src"))
@@ -54,9 +47,6 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
-def batch_hash(chunks: List[str], prompt: str) -> str:
-    return hashlib.md5(("".join(chunks) + prompt).encode("utf-8")).hexdigest()
 def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
     """Extract text from a range of PDF pages."""
     try:
@@ -66,8 +56,7 @@ def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
                 page_text = page.extract_text() or ""
                 text_chunks.append(f"=== Page {start_page + pdf.pages.index(page) + 1} ===\n{page_text.strip()}")
         return "\n\n".join(text_chunks)
-    except Exception as e:
-        logger.error(f"Error extracting pages {start_page}-{end_page}: {e}")
         return ""
 def extract_all_pages(file_path: str, progress_callback=None) -> str:
@@ -79,14 +68,17 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         if total_pages == 0:
             return ""
         num_processes = min(6, multiprocessing.cpu_count())
         pages_per_process = max(1, total_pages // num_processes)
         ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
                   for i in range(num_processes)]
         if ranges[-1][1] != total_pages:
             ranges[-1] = (ranges[-1][0], total_pages)
         with multiprocessing.Pool(processes=num_processes) as pool:
             extract_func = partial(extract_page_range, file_path)
             results = []
@@ -98,7 +90,6 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         return "\n\n".join(filter(None, results))
     except Exception as e:
-        logger.error(f"PDF processing error: {e}")
         return f"PDF processing error: {str(e)}"
 def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
@@ -130,61 +121,87 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             f.write(result)
         return result
     except Exception as e:
-        logger.error(f"Error processing {file_path}: {e}")
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
         mem = psutil.virtual_memory()
-        logger.info(f"[{tag}] CPU: {cpu}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
         result = subprocess.run(
             ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
             capture_output=True, text=True
         )
         if result.returncode == 0:
             used, total, util = result.stdout.strip().split(", ")
-            logger.info(f"[{tag}] GPU: {used}MB / {total}MB | Utilization: {util}%")
     except Exception as e:
-        logger.error(f"[{tag}] GPU/CPU monitor failed: {e}")
 def clean_response(text: str) -> str:
-    """Clean TxAgent response to group findings by section without tool names."""
     text = sanitize_utf8(text)
-    # Remove tool tags, None, and reasoning
-    text = re.sub(r"\[TOOL:[^\]]+\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
     text = re.sub(r"\n{3,}", "\n\n", text)
-    text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)
     sections = {}
     current_section = None
     lines = text.splitlines()
     for line in lines:
         line = line.strip()
         if not line:
             continue
-        section_match = re.match(r"###\s*(Drugs|Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
             if current_section not in sections:
                 sections[current_section] = []
             continue
         finding_match = re.match(r"-\s*.+", line)
         if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
-            sections[current_section].append(line)
     cleaned = []
     for heading, findings in sections.items():
-        if findings:
             cleaned.append(f"### {heading}\n" + "\n".join(findings))
     text = "\n\n".join(cleaned).strip()
     if not text:
-        text = ""
     return text
 def init_agent():
-    logger.info("Initializing model...")
     log_system_usage("Before Load")
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
@@ -197,67 +214,15 @@ def init_agent():
         tool_files_dict={"new_tool": target_tool_path},
         force_finish=True,
         enable_checker=True,
-        step_rag_num=1,
         seed=100,
         additional_default_tools=[],
     )
     agent.init_model()
     log_system_usage("After Load")
-    logger.info("Agent Ready")
     return agent
-def process_batch(agent, chunks: List[str], cache_path: str, prompt_template: str) -> str:
-    """Process a batch of chunks in a single prompt."""
-    if not any(chunk.strip() for chunk in chunks):
-        logger.warning("All chunks are empty, skipping analysis...")
-        return "No oversights identified in the provided records."
-    batch_id = batch_hash(chunks, prompt_template)
-    batch_cache_path = os.path.join(cache_path, f"batch_{batch_id}.txt")
-    if os.path.exists(batch_cache_path):
-        with open(batch_cache_path, "r", encoding="utf-8") as f:
-            logger.info("Cache hit for batch")
-            return f.read()
-    # Combine chunks into one prompt
-    chunk_texts = [f"Chunk {i+1}:\n{chunk[:500]}" for i, chunk in enumerate(chunks) if chunk.strip()]
-    combined_text = "\n\n".join(chunk_texts)
-    prompt = prompt_template.format(chunks=combined_text)
-    response = ""
-    try:
-        for output in agent.run_gradio_chat(
-            message=prompt,
-            history=[],
-            temperature=0.2,
-            max_new_tokens=256,
-            max_token=1024,
-            call_agent=False,
-            conversation=[],
-        ):
-            if output is None:
-                continue
-            if isinstance(output, list):
-                for m in output:
-                    if hasattr(m, 'content') and m.content:
-                        cleaned = clean_response(m.content)
-                        if cleaned and re.search(r"###\s*\w+", cleaned):
-                            response += cleaned + "\n\n"
-            elif isinstance(output, str) and output.strip():
-                cleaned = clean_response(output)
-                if cleaned and re.search(r"###\s*\w+", cleaned):
-                    response += cleaned + "\n\n"
-    except Exception as e:
-        logger.error(f"Error processing batch: {e}")
-        return f"Error occurred: {str(e)}"
-    if response:
-        with open(batch_cache_path, "w", encoding="utf-8") as f:
-            f.write(response)
-        return response
-    return "No oversights identified in the provided records."
 def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
@@ -267,27 +232,6 @@ def create_ui(agent):
         send_btn = gr.Button("Analyze", variant="primary")
         download_output = gr.File(label="Download Full Report")
-        prompt_template = """
-You are a medical analysis assistant. Analyze the following patient record excerpts for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under the following headings: 'Drugs', 'Missed Diagnoses', 'Medication Conflicts', 'Incomplete Assessments', 'Urgent Follow-up'. For each finding, include:
-- Clinical context (why the issue was missed or relevant details from the record).
-- Potential risks if unaddressed (e.g., disease progression, adverse events).
-- Actionable recommendations (e.g., tests, referrals, medication adjustments).
-Output ONLY the markdown-formatted findings, with bullet points under each heading. Do NOT include tool references, reasoning, or intermediate steps. If no issues are found for a section, omit that section. Ensure the output is specific to the provided text and avoids generic responses.
-Example Output:
-### Drugs
-- Opioid use disorder not addressed. Missed due to lack of screening. Risks: overdose. Recommend: addiction specialist referral.
-### Missed Diagnoses
-- Elevated BP noted without diagnosis. Missed due to inconsistent visits. Risks: stroke. Recommend: BP monitoring, antihypertensives.
-### Incomplete Assessments
-- Chest pain not evaluated. Time constraints likely cause. Risks: cardiac issues. Recommend: ECG, stress test.
-### Urgent Follow-up
-- Abnormal creatinine not addressed. Delayed lab review. Risks: renal failure. Recommend: nephrology referral.
-Patient Record Excerpts:
-{chunks}
-"""
         def analyze(message: str, history: List[dict], files: List):
             history.append({"role": "user", "content": message})
             history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
@@ -296,6 +240,7 @@ Patient Record Excerpts:
             extracted = ""
             file_hash_value = ""
             if files:
                 total_pages = 0
                 processed_pages = 0
                 def update_extraction_progress(current, total):
@@ -312,36 +257,102 @@ Patient Record Excerpts:
                     extracted = "\n".join(results)
                     file_hash_value = file_hash(files[0].name) if files else ""
-            history.pop()
             history.append({"role": "assistant", "content": "✅ Text extraction complete."})
             yield history, None
-            chunk_size = 500  # Fixed for speed
-            max_chunks = 5    # Fixed for speed
             chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
-            chunks = chunks[:max_chunks]  # Limit to 5 chunks
-            if not chunks:
-                history.append({"role": "assistant", "content": "No content to analyze."})
-                yield history, None
-                return
-            try:
-                animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
-                history.append({"role": "assistant", "content": f"Analyzing chunks 1-5... {animation}"})
-                yield history, None
-                response = process_batch(agent, chunks, file_cache_dir, prompt_template)
-                history[-1] = {"role": "assistant", "content": response.strip()}
-                yield history, None
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-                if report_path and response.strip() and "No oversights identified" not in response and "Error occurred" not in response:
                     with open(report_path, "w", encoding="utf-8") as f:
-                        f.write(response)
                 yield history, report_path if report_path and os.path.exists(report_path) else None
             except Exception as e:
-                logger.error(f"Analysis error: {e}")
                 history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
                 yield history, None
@@ -350,7 +361,7 @@ Patient Record Excerpts:
     return demo
 if __name__ == "__main__":
-    logger.info("Launching app...")
     agent = init_agent()
     demo = create_ui(agent)
     demo.queue(api_open=False).launch(

 import multiprocessing
 from functools import partial
 import time
 # Persistent directory
 persistent_dir = "/data/hf_cache"
     os.makedirs(directory, exist_ok=True)
 os.environ["HF_HOME"] = model_cache_dir
+os.environ["TRANSFORMERS_CACHE"] = model_cache_dir
 os.environ["VLLM_CACHE_DIR"] = vllm_cache_dir
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_path = os.path.abspath(os.path.join(current_dir, "src"))
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
 def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
     """Extract text from a range of PDF pages."""
     try:
                 page_text = page.extract_text() or ""
                 text_chunks.append(f"=== Page {start_page + pdf.pages.index(page) + 1} ===\n{page_text.strip()}")
         return "\n\n".join(text_chunks)
+    except Exception:
         return ""
 def extract_all_pages(file_path: str, progress_callback=None) -> str:
         if total_pages == 0:
             return ""
+        # Use up to 6 worker processes, capped at the number of CPU cores
         num_processes = min(6, multiprocessing.cpu_count())
         pages_per_process = max(1, total_pages // num_processes)
+        # Create page ranges for parallel processing
         ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
                   for i in range(num_processes)]
         if ranges[-1][1] != total_pages:
             ranges[-1] = (ranges[-1][0], total_pages)
+        # Process page ranges in parallel
         with multiprocessing.Pool(processes=num_processes) as pool:
             extract_func = partial(extract_page_range, file_path)
             results = []
         return "\n\n".join(filter(None, results))
     except Exception as e:
         return f"PDF processing error: {str(e)}"
 def convert_file_to_json(file_path: str, file_type: str, progress_callback=None) -> str:
             f.write(result)
         return result
     except Exception as e:
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
         mem = psutil.virtual_memory()
+        print(f"[{tag}] CPU: {cpu}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
         result = subprocess.run(
             ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
             capture_output=True, text=True
         )
         if result.returncode == 0:
             used, total, util = result.stdout.strip().split(", ")
+            print(f"[{tag}] GPU: {used}MB / {total}MB | Utilization: {util}%")
     except Exception as e:
+        print(f"[{tag}] GPU/CPU monitor failed: {e}")
 def clean_response(text: str) -> str:
+    """Clean TxAgent response to group findings under tool-derived headings."""
     text = sanitize_utf8(text)
+    # Remove tool call artifacts, None, and reasoning
+    text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
+    # Remove extra whitespace and non-markdown content
     text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)  # Keep markdown-relevant characters
+    # Define tool-to-heading mapping
+    tool_to_heading = {
+        "get_abuse_info_by_drug_name": "Drugs",
+        "get_dependence_info_by_drug_name": "Drugs",
+        "get_abuse_types_and_related_adverse_reactions_and_controlled_substance_status_by_drug_name": "Drugs",
+        "get_info_for_patients_by_drug_name": "Drugs",
+        # Add other tools from new_tool.json if applicable
+    }
+    # Parse sections and findings
     sections = {}
     current_section = None
+    current_tool = None
     lines = text.splitlines()
     for line in lines:
         line = line.strip()
         if not line:
             continue
+        # Detect tool tag
+        tool_match = re.match(r"\[TOOL:\s*(\w+)\]", line)
+        if tool_match:
+            current_tool = tool_match.group(1)
+            continue
+        # Detect section heading
+        section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
             if current_section not in sections:
                 sections[current_section] = []
             continue
+        # Detect finding
         finding_match = re.match(r"-\s*.+", line)
         if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
+            # Assign to tool-derived heading if tool is specified
+            if current_tool and current_tool in tool_to_heading:
+                heading = tool_to_heading[current_tool]
+                if heading not in sections:
+                    sections[heading] = []
+                sections[heading].append(line)
+            else:
+                sections[current_section].append(line)
+    # Combine non-empty sections
     cleaned = []
     for heading, findings in sections.items():
+        if findings:  # Only include sections with findings
             cleaned.append(f"### {heading}\n" + "\n".join(findings))
     text = "\n\n".join(cleaned).strip()
     if not text:
+        text = ""  # Return empty string if no valid findings
     return text
 def init_agent():
+    print("🔁 Initializing model...")
     log_system_usage("Before Load")
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
         tool_files_dict={"new_tool": target_tool_path},
         force_finish=True,
         enable_checker=True,
+        step_rag_num=4,
         seed=100,
         additional_default_tools=[],
     )
     agent.init_model()
     log_system_usage("After Load")
+    print("✅ Agent Ready")
     return agent
 def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         send_btn = gr.Button("Analyze", variant="primary")
         download_output = gr.File(label="Download Full Report")
         def analyze(message: str, history: List[dict], files: List):
             history.append({"role": "user", "content": message})
             history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
             extracted = ""
             file_hash_value = ""
             if files:
+                # Progress callback for extraction
                 total_pages = 0
                 processed_pages = 0
                 def update_extraction_progress(current, total):
                     extracted = "\n".join(results)
                     file_hash_value = file_hash(files[0].name) if files else ""
+            history.pop()  # Remove extraction message
             history.append({"role": "assistant", "content": "✅ Text extraction complete."})
             yield history, None
+            # Split extracted text into chunks of ~6,000 characters
+            chunk_size = 6000
             chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+            combined_response = ""
+            prompt_template = """
+You are a medical analysis assistant. Analyze the following patient record excerpt for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under appropriate headings based on the tool used (e.g., drug-related findings under 'Drugs'). For each finding, include:
+- Clinical context (why the issue was missed or relevant details from the record).
+- Potential risks if unaddressed (e.g., disease progression, adverse events).
+- Actionable recommendations (e.g., tests, referrals, medication adjustments).
+Output ONLY the markdown-formatted findings, with bullet points under each heading. Precede each finding with a tool tag (e.g., [TOOL: get_abuse_info_by_drug_name]) to indicate the tool used. Do NOT include reasoning, tool calls, or intermediate steps. If no issues are found for a tool or category, state "No issues identified" for that section. Ensure the output is specific to the provided text and avoids generic responses.
+Example Output:
+### Drugs
+[TOOL: get_abuse_info_by_drug_name]
+- Opioid use disorder not addressed. Missed due to lack of screening. Risks: overdose. Recommend: addiction specialist referral.
+### Missed Diagnoses
+- Elevated BP noted without diagnosis. Missed due to inconsistent visits. Risks: stroke. Recommend: BP monitoring, antihypertensives.
+### Incomplete Assessments
+- Chest pain not evaluated. Time constraints likely cause. Risks: cardiac issues. Recommend: ECG, stress test.
+### Urgent Follow-up
+- Abnormal creatinine not addressed. Delayed lab review. Risks: renal failure. Recommend: nephrology referral.
+Patient Record Excerpt (Chunk {0} of {1}):
+{chunk}
+"""
+            try:
+                # Process each chunk and stream results in real-time
+                for chunk_idx, chunk in enumerate(chunks, 1):
+                    # Update UI with chunk progress
+                    animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
+                    history.append({"role": "assistant", "content": f"Analyzing records... {animation} Chunk {chunk_idx}/{len(chunks)}"})
+                    yield history, None
+                    prompt = prompt_template.format(chunk_idx, len(chunks), chunk=chunk[:4000])  # Truncate to avoid token limits
+                    chunk_response = ""
+                    for chunk_output in agent.run_gradio_chat(
+                        message=prompt,
+                        history=[],
+                        temperature=0.2,
+                        max_new_tokens=1024,
+                        max_token=4096,
+                        call_agent=False,
+                        conversation=[],
+                    ):
+                        if chunk_output is None:
+                            continue
+                        if isinstance(chunk_output, list):
+                            for m in chunk_output:
+                                if hasattr(m, 'content') and m.content:
+                                    cleaned = clean_response(m.content)
+                                    if cleaned and re.search(r"###\s*\w+", cleaned):
+                                        chunk_response += cleaned + "\n\n"
+                                        # Update UI with partial response
+                                        if history[-1]["content"].startswith("Analyzing"):
+                                            history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
+                                        else:
+                                            history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
+                                        yield history, None
+                        elif isinstance(chunk_output, str) and chunk_output.strip():
+                            cleaned = clean_response(chunk_output)
+                            if cleaned and re.search(r"###\s*\w+", cleaned):
+                                chunk_response += cleaned + "\n\n"
+                                # Update UI with partial response
+                                if history[-1]["content"].startswith("Analyzing"):
+                                    history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
+                                else:
+                                    history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
+                                yield history, None
+                    # Append completed chunk response to combined response
+                    if chunk_response:
+                        combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
+                    else:
+                        combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
+                # Finalize UI with complete response
+                if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
+                    history[-1]["content"] = combined_response.strip()
+                else:
+                    history.append({"role": "assistant", "content": "No oversights identified in the provided records."})
+                # Generate report file
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
+                if report_path:
                     with open(report_path, "w", encoding="utf-8") as f:
+                        f.write(combined_response)
                 yield history, report_path if report_path and os.path.exists(report_path) else None
             except Exception as e:
+                print("🚨 ERROR:", e)
                 history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
                 yield history, None
     return demo
 if __name__ == "__main__":
+    print("🚀 Launching app...")
     agent = init_agent()
     demo = create_ui(agent)
     demo.queue(api_open=False).launch(