CPS-Test-Mobile

Paused

App Files Files Community

Ali2206 committed on Apr 17

Commit

90e24e0

verified ·

1 Parent(s): 4d00da5

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -5

app.py CHANGED Viewed

@@ -6,6 +6,12 @@ import gradio as gr
 from typing import List, Dict
 from concurrent.futures import ThreadPoolExecutor
 import hashlib
 # Persistent directories
 persistent_dir = "/data/hf_cache"
@@ -24,18 +30,46 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
-def extract_all_pages(file_path: str) -> str:
-    """Extract text from all pages of a PDF."""
     try:
         text_chunks = []
         with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
                 page_text = page.extract_text() or ""
                 text_chunks.append(page_text.strip())
         return "\n".join(text_chunks)
     except Exception:
         return ""
 def convert_file_to_text(file_path: str, file_type: str) -> str:
     """Convert supported file types to text, caching results."""
     try:
@@ -58,6 +92,8 @@ def convert_file_to_text(file_path: str, file_type: str) -> str:
             text = ""
         if text:
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(text)
         return text
@@ -149,7 +185,7 @@ def create_ui():
     def analyze(message: str, history: List[dict], files: List):
         """Handle analysis and return results."""
         history.append({"role": "user", "content": message})
-        history.append({"role": "assistant", "content": "⏳ Analyzing..."})
         yield history, None
         extracted_text = ""
@@ -161,17 +197,22 @@ def create_ui():
                 extracted_text = "\n".join(sanitize_utf8(r) for r in results if r)
                 file_hash_value = file_hash(files[0].name) if files else ""
-        history.pop()  # Remove "Analyzing..."
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
         try:
             response = analyze_medical_records(extracted_text)
             history.append({"role": "assistant", "content": response})
             if report_path:
                 with open(report_path, "w", encoding="utf-8") as f:
                     f.write(response)
             yield history, report_path if report_path and os.path.exists(report_path) else None
         except Exception as e:
             history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
             yield history, None

 from typing import List, Dict
 from concurrent.futures import ThreadPoolExecutor
 import hashlib
+import multiprocessing
+from functools import partial
+import logging
+# Suppress pdfplumber CropBox warnings
+logging.getLogger("pdfplumber").setLevel(logging.ERROR)
 # Persistent directories
 persistent_dir = "/data/hf_cache"
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
+def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
+    """Extract text from a range of PDF pages."""
     try:
         text_chunks = []
         with pdfplumber.open(file_path) as pdf:
+            for page in pdf.pages[start_page:end_page]:  # slice: start_page inclusive, end_page exclusive
                 page_text = page.extract_text() or ""  # a falsy extract_text() result becomes ""
                 text_chunks.append(page_text.strip())
         return "\n".join(text_chunks)  # page texts joined with newlines, in page order
     except Exception:
         return ""  # best-effort: any failure yields an empty string instead of raising
+def extract_all_pages(file_path: str) -> str:
+    """Extract text from all pages of a PDF using parallel processing."""
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            total_pages = len(pdf.pages)  # this open is used only to count the pages
+        if total_pages == 0:
+            return ""
+        # Use at most 4 worker processes, fewer when the machine reports fewer CPU cores
+        num_processes = min(4, multiprocessing.cpu_count())
+        pages_per_process = max(1, total_pages // num_processes)  # floor division, never below 1
+        # Create half-open (start, end) page ranges, one per worker process
+        ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
+                  for i in range(num_processes)]  # with fewer pages than workers, trailing ranges are empty
+        if ranges[-1][1] != total_pages:
+            ranges[-1] = (ranges[-1][0], total_pages)  # last range absorbs the division remainder
+        # Process page ranges in parallel
+        with multiprocessing.Pool(processes=num_processes) as pool:
+            extract_func = partial(extract_page_range, file_path)  # bind the path; starmap supplies (start, end)
+            results = pool.starmap(extract_func, ranges)
+        return "\n".join(filter(None, results))  # drop empty chunks before joining
+    except Exception:
+        return ""  # best-effort: any failure yields an empty string instead of raising
 def convert_file_to_text(file_path: str, file_type: str) -> str:
     """Convert supported file types to text, caching results."""
     try:
             text = ""
         if text:
+            # Compress text by removing redundant whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(text)
         return text
     def analyze(message: str, history: List[dict], files: List):
         """Handle analysis and return results."""
         history.append({"role": "user", "content": message})
+        history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
         yield history, None
         extracted_text = ""
                 extracted_text = "\n".join(sanitize_utf8(r) for r in results if r)
                 file_hash_value = file_hash(files[0].name) if files else ""
+        history.pop()  # Remove "Extracting..."
+        history.append({"role": "assistant", "content": "⏳ Analyzing medical records..."})
+        yield history, None
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
         try:
             response = analyze_medical_records(extracted_text)
+            history.pop()  # Remove "Analyzing..."
             history.append({"role": "assistant", "content": response})
             if report_path:
                 with open(report_path, "w", encoding="utf-8") as f:
                     f.write(response)
             yield history, report_path if report_path and os.path.exists(report_path) else None
         except Exception as e:
+            history.pop()  # Remove "Analyzing..."
             history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
             yield history, None