Ali2206 committed (verified)
Commit 6741b3e · Parent(s): 9277e15

Update app.py

Files changed (1): app.py (+96 −87)

app.py CHANGED
@@ -1,7 +1,6 @@
 import sys
 import os
 import pandas as pd
-import pdfplumber
 import json
 import gradio as gr
 from typing import List
@@ -16,9 +15,15 @@ import torch
 import gc
 from diskcache import Cache
 import time
+import asyncio
+import pypdfium2 as pdfium
+import pytesseract
+from PIL import Image
+import io
 
-# Configure logging
+# Configure logging and suppress warnings
 logging.basicConfig(level=logging.INFO)
+logging.getLogger("pdfminer").setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
 
 # Persistent directory
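The new extraction path swaps pdfplumber for pypdfium2 plus a pytesseract OCR fallback. Worth noting: pytesseract is only a thin wrapper that shells out to the system `tesseract` binary, so the runtime image needs that binary available. A minimal startup sanity check could look like the sketch below (hypothetical helper, not part of this commit):

```python
# Hypothetical startup check for the new extraction dependencies (not part of this commit).
import shutil

import pypdfium2 as pdfium  # ships its own PDFium build, no system package needed
import pytesseract


def check_extraction_deps() -> None:
    # pytesseract only wraps the external `tesseract` binary, which must be on PATH.
    if shutil.which("tesseract") is None:
        raise RuntimeError("tesseract binary not found; the OCR fallback will fail")
    print("tesseract:", pytesseract.get_tesseract_version())


check_extraction_deps()
```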
@@ -56,37 +61,45 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
 
-def extract_all_pages(file_path: str, progress_callback=None) -> str:
+async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
     try:
-        with pdfplumber.open(file_path) as pdf:
-            total_pages = len(pdf.pages)
-            if total_pages == 0:
-                return ""
+        pdf = pdfium.PdfDocument(file_path)
+        total_pages = len(pdf)
+        if total_pages == 0:
+            return ""
 
-        batch_size = 10
+        batch_size = 5
         batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
         text_chunks = [""] * total_pages
         processed_pages = 0
 
         def extract_batch(start: int, end: int) -> List[tuple]:
             results = []
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[start:end]:
-                    page_num = start + pdf.pages.index(page)
-                    page_text = page.extract_text() or ""
-                    results.append((page_num, f"=== Page {page_num + 1} ===\n{page_text.strip()}"))
+            for i in range(start, end):
+                page = pdf[i]
+                text = page.get_textpage().get_text_range() or ""
+                if not text.strip() and use_ocr:
+                    # Fallback to OCR
+                    bitmap = page.render(scale=2).to_pil()
+                    text = pytesseract.image_to_string(bitmap, lang="eng")
+                results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
             return results
 
-        with ThreadPoolExecutor(max_workers=6) as executor:
-            futures = [executor.submit(extract_batch, start, end) for start, end in batches]
-            for future in as_completed(futures):
-                for page_num, text in future.result():
+        loop = asyncio.get_event_loop()
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
+            for future in await asyncio.gather(*futures):
+                for page_num, text in future:
                     text_chunks[page_num] = text
+                    logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
                 processed_pages += batch_size
                 if progress_callback:
                     progress_callback(min(processed_pages, total_pages), total_pages)
 
-        return "\n\n".join(filter(None, text_chunks))
+        pdf.close()
+        extracted_text = "\n\n".join(filter(None, text_chunks))
+        logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+        return extracted_text
     except Exception as e:
         logger.error("PDF processing error: %s", e)
        return f"PDF processing error: {str(e)}"
@@ -96,10 +109,15 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
         file_h = file_hash(file_path)
         cache_key = f"{file_h}_{file_type}"
         if cache_key in cache:
+            logger.info("Using cached extraction for %s", file_path)
             return cache[cache_key]
 
         if file_type == "pdf":
-            text = extract_all_pages(file_path, progress_callback)
+            # Try without OCR first, fallback to OCR if empty
+            text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
+            if not text.strip() or "PDF processing error" in text:
+                logger.info("Retrying extraction with OCR for %s", file_path)
+                text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=True))
             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
         elif file_type == "csv":
             df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
@@ -117,6 +135,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         cache[cache_key] = result
+        logger.info("Cached extraction for %s, size: %d bytes", file_path, len(result))
         return result
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
@@ -139,66 +158,49 @@ def log_system_usage(tag=""):
 
 def clean_response(text: str) -> str:
     text = sanitize_utf8(text)
-    text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)
-
+    text = text.replace("[", "").replace("]", "").replace("None", "")  # Faster string ops
+    text = text.replace("\n\n\n", "\n\n")
     sections = {}
     current_section = None
-    lines = text.splitlines()
-    for line in lines:
+    for line in text.splitlines():
         line = line.strip()
         if not line:
             continue
         section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
-            if current_section not in sections:
-                sections[current_section] = []
+            sections.setdefault(current_section, [])
             continue
-        finding_match = re.match(r"-\s*.+", line)
-        if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
+        if current_section and line.startswith("- ") and "No issues identified" not in line:
             sections[current_section].append(line)
-
-    cleaned = []
-    for heading, findings in sections.items():
-        if findings:
-            cleaned.append(f"### {heading}\n" + "\n".join(findings))
-
-    text = "\n\n".join(cleaned).strip()
-    return text if text else ""
+    cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
+    result = "\n\n".join(cleaned).strip()
+    logger.debug("Cleaned response length: %d chars", len(result))
+    return result or ""
 
 def summarize_findings(combined_response: str) -> str:
     if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
         return "### Summary of Clinical Oversights\nNo critical oversights identified in the provided records."
-
     sections = {}
-    lines = combined_response.splitlines()
     current_section = None
-    for line in lines:
+    for line in combined_response.splitlines():
         line = line.strip()
         if not line:
             continue
         section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
-            if current_section not in sections:
-                sections[current_section] = []
+            sections.setdefault(current_section, [])
             continue
-        finding_match = re.match(r"-\s*(.+)", line)
-        if finding_match and current_section:
-            sections[current_section].append(finding_match.group(1))
-
-    summary_lines = []
-    for heading, findings in sections.items():
-        if findings:
-            summary = f"- **{heading}**: {'; '.join(findings[:2])}. Risks: {heading.lower()} may lead to adverse outcomes. Recommend: urgent review and specialist referral."
-            summary_lines.append(summary)
-
-    if not summary_lines:
-        return "### Summary of Clinical Oversights\nNo critical oversights identified."
-
-    return "### Summary of Clinical Oversights\n" + "\n".join(summary_lines)
+        if current_section and line.startswith("- "):
+            sections[current_section].append(line[2:])
+    summary_lines = [
+        f"- **{heading}**: {'; '.join(findings[:1])}. Risks: potential adverse outcomes. Recommend: urgent review."
+        for heading, findings in sections.items() if findings
+    ]
+    result = "### Summary of Clinical Oversights\n" + "\n".join(summary_lines) if summary_lines else "### Summary of Clinical Oversights\nNo critical oversights identified."
+    logger.debug("Summary length: %d chars", len(result))
+    return result
 
 def init_agent():
     logger.info("Initializing model...")
@@ -214,7 +216,9 @@ def init_agent():
         tool_files_dict={"new_tool": target_tool_path},
         force_finish=True,
         enable_checker=False,
-        step_rag_num=4,
+        enable_rag=False,
+        init_rag_num=0,
+        step_rag_num=0,
         seed=100,
         additional_default_tools=[],
     )
@@ -241,7 +245,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
 {chunk}
 """
 
-    def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
+    async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
         history.append({"role": "user", "content": message})
         yield history, None, ""
 
@@ -252,56 +256,61 @@ Patient Record Excerpt (Chunk {0} of {1}):
             progress(current / total, desc=f"Extracting text... Page {current}/{total}")
             return history, None, ""
 
-        with ThreadPoolExecutor(max_workers=6) as executor:
-            futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
-            results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
-            extracted = "\n".join(results)
-            file_hash_value = file_hash(files[0].name) if files else ""
+        futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
+        results = [sanitize_utf8(future) for future in futures]
+        extracted = "\n".join(results)
+        file_hash_value = file_hash(files[0].name) if files else ""
 
         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
         yield history, None, ""
+        logger.info("Extracted text length: %d chars", len(extracted))
 
-        chunk_size = 6000
+        chunk_size = 4000  # Increased slightly
         chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
+        logger.info("Created %d chunks", len(chunks))
         combined_response = ""
         batch_size = 2
 
         try:
             for batch_idx in range(0, len(chunks), batch_size):
                 batch_chunks = chunks[batch_idx:batch_idx + batch_size]
-                batch_prompts = [prompt_template.format(i + 1, len(chunks), chunk=chunk[:4000]) for i, chunk in enumerate(batch_chunks)]
+                batch_prompts = [prompt_template.format(i + 1, len(chunks), chunk=chunk[:2000]) for i, chunk in enumerate(batch_chunks)]
                 batch_responses = []
 
                 progress((batch_idx + 1) / len(chunks), desc=f"Analyzing chunks {batch_idx + 1}-{min(batch_idx + batch_size, len(chunks))}/{len(chunks)}")
 
-                with ThreadPoolExecutor(max_workers=len(batch_chunks)) as executor:
-                    futures = [executor.submit(agent.run_gradio_chat, prompt, [], 0.2, 512, 2048, False, []) for prompt in batch_prompts]
-                    for future in as_completed(futures):
-                        chunk_response = ""
-                        for chunk_output in future.result():
-                            if chunk_output is None:
-                                continue
-                            if isinstance(chunk_output, list):
-                                for m in chunk_output:
-                                    if hasattr(m, 'content') and m.content:
-                                        cleaned = clean_response(m.content)
-                                        if cleaned and re.search(r"###\s*\w+", cleaned):
-                                            chunk_response += cleaned + "\n\n"
-                            elif isinstance(chunk_output, str) and chunk_output.strip():
-                                cleaned = clean_response(m.content)
-                                if cleaned and re.search(r"###\s*\w+", cleaned):
-                                    chunk_response += cleaned + "\n\n"
-                        batch_responses.append(chunk_response)
-                        torch.cuda.empty_cache()
-                        gc.collect()
+                async def process_chunk(prompt):
+                    chunk_response = ""
+                    for chunk_output in agent.run_gradio_chat(
+                        message=prompt, history=[], temperature=0.2, max_new_tokens=128, max_token=768, call_agent=False, conversation=[]
+                    ):
+                        if chunk_output is None:
+                            continue
+                        if isinstance(chunk_output, list):
+                            for m in chunk_output:
+                                if hasattr(m, 'content') and m.content:
+                                    cleaned = clean_response(m.content)
+                                    if cleaned and re.search(r"###\s*\w+", cleaned):
+                                        chunk_response += cleaned + "\n\n"
+                        elif isinstance(chunk_output, str) and chunk_output.strip():
+                            cleaned = clean_response(chunk_output)
+                            if cleaned and re.search(r"###\s*\w+", cleaned):
+                                chunk_response += cleaned + "\n\n"
+                    logger.debug("Chunk response length: %d chars", len(chunk_response))
+                    return chunk_response
+
+                futures = [process_chunk(prompt) for prompt in batch_prompts]
+                batch_responses = await asyncio.gather(*futures)
+                torch.cuda.empty_cache()
+                gc.collect()
 
                 for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
                     if chunk_response:
                         combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
                     else:
                         combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
-                history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                yield history, None, ""
+                history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                yield history, None, ""
 
             if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
                 history[-1]["content"] = combined_response.strip()
 