Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ import shutil
|
|
11 |
import re
|
12 |
import psutil
|
13 |
import subprocess
|
|
|
|
|
14 |
import time
|
15 |
|
16 |
# Persistent directory
|
@@ -38,9 +40,6 @@ sys.path.insert(0, src_path)
|
|
38 |
|
39 |
from txagent.txagent import TxAgent
|
40 |
|
41 |
-
MEDICAL_KEYWORDS = {'diagnosis', 'assessment', 'plan', 'results', 'medications',
|
42 |
-
'allergies', 'summary', 'impression', 'findings', 'recommendations'}
|
43 |
-
|
44 |
def sanitize_utf8(text: str) -> str:
|
45 |
return text.encode("utf-8", "ignore").decode("utf-8")
|
46 |
|
@@ -48,20 +47,48 @@ def file_hash(path: str) -> str:
|
|
48 |
with open(path, "rb") as f:
|
49 |
return hashlib.md5(f.read()).hexdigest()
|
50 |
|
51 |
-
def
|
|
|
52 |
try:
|
53 |
text_chunks = []
|
54 |
with pdfplumber.open(file_path) as pdf:
|
55 |
-
|
56 |
-
processed_pages = 0
|
57 |
-
for i, page in enumerate(pdf.pages):
|
58 |
page_text = page.extract_text() or ""
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
if progress_callback:
|
|
|
63 |
progress_callback(processed_pages, total_pages)
|
64 |
-
|
|
|
65 |
except Exception as e:
|
66 |
return f"PDF processing error: {str(e)}"
|
67 |
|
@@ -74,7 +101,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
|
|
74 |
return f.read()
|
75 |
|
76 |
if file_type == "pdf":
|
77 |
-
text =
|
78 |
result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
|
79 |
elif file_type == "csv":
|
80 |
df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
|
|
|
11 |
import re
|
12 |
import psutil
|
13 |
import subprocess
|
14 |
+
import multiprocessing
|
15 |
+
from functools import partial
|
16 |
import time
|
17 |
|
18 |
# Persistent directory
|
|
|
40 |
|
41 |
from txagent.txagent import TxAgent
|
42 |
|
|
|
|
|
|
|
43 |
def sanitize_utf8(text: str) -> str:
    """Return *text* with any non-UTF-8-encodable code points dropped."""
    # Round-trip through bytes: encoding with errors="ignore" silently
    # discards unencodable characters (e.g. lone surrogates).
    cleaned = text.encode("utf-8", "ignore")
    return cleaned.decode("utf-8")
|
45 |
|
|
|
47 |
with open(path, "rb") as f:
|
48 |
return hashlib.md5(f.read()).hexdigest()
|
49 |
|
50 |
+
def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
    """Extract text from PDF pages in the half-open range [start_page, end_page).

    Designed to run as a multiprocessing worker: the PDF is reopened in
    each process and all failures are swallowed.

    Args:
        file_path: Path to the PDF file.
        start_page: 0-based index of the first page (inclusive).
        end_page: 0-based index one past the last page (exclusive).

    Returns:
        The pages' text joined by blank lines, each chunk preceded by a
        "=== Page N ===" header (N is the 1-based absolute page number),
        or "" if the file cannot be opened or parsed.
    """
    try:
        text_chunks = []
        with pdfplumber.open(file_path) as pdf:
            # enumerate(..., start=start_page) yields the absolute 0-based
            # page index directly.  The original computed
            # `start_page + pdf.pages.index(page) + 1`, but list.index()
            # already returns the ABSOLUTE position, so start_page was
            # double-counted (page 2 of a range starting at 2 was labeled
            # "Page 5" instead of "Page 3") — and index() is O(n) per page.
            for abs_idx, page in enumerate(pdf.pages[start_page:end_page],
                                           start=start_page):
                page_text = page.extract_text() or ""
                text_chunks.append(
                    f"=== Page {abs_idx + 1} ===\n{page_text.strip()}"
                )
        return "\n\n".join(text_chunks)
    except Exception:
        # Worker process: report failure as an empty result; the caller
        # filters out empty chunks.
        return ""
|
61 |
+
|
62 |
+
def extract_all_pages(file_path: str, progress_callback=None) -> str:
    """Extract text from all pages of a PDF using parallel processing.

    The page count is read once in this process; the pages are then split
    into contiguous chunks and extracted by a pool of worker processes
    via :func:`extract_page_range`.

    Args:
        file_path: Path to the PDF file.
        progress_callback: Optional ``callable(processed_pages, total_pages)``.
            NOTE(review): ``Pool.starmap`` blocks until every worker is done,
            so the callbacks fire in a burst after extraction completes, not
            incrementally during it — switch to ``imap`` if live progress
            is required.

    Returns:
        All non-empty page chunks joined by blank lines, "" for an empty
        PDF, or a string starting with "PDF processing error:" on failure.
    """
    try:
        # Open only to count pages; each worker reopens the file itself.
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

        if total_pages == 0:
            return ""

        # Cap at 6 workers (empirical choice); fewer on small machines.
        num_processes = min(6, multiprocessing.cpu_count())
        pages_per_process = max(1, total_pages // num_processes)

        # Build half-open [start, end) chunks.  When total_pages <
        # num_processes the naive list contains degenerate ranges
        # (start >= end, e.g. (4, 3) for 3 pages / 6 workers), which the
        # original dispatched as useless pool tasks — drop them instead.
        ranges = [
            (i * pages_per_process,
             min((i + 1) * pages_per_process, total_pages))
            for i in range(num_processes)
        ]
        ranges = [(start, end) for start, end in ranges if start < end]

        # Ensure the final chunk reaches the last page (integer division
        # above can leave a remainder).
        if ranges and ranges[-1][1] != total_pages:
            ranges[-1] = (ranges[-1][0], total_pages)

        # Fan the chunks out to the worker pool.
        with multiprocessing.Pool(processes=num_processes) as pool:
            extract_func = partial(extract_page_range, file_path)
            results = []
            for idx, result in enumerate(pool.starmap(extract_func, ranges)):
                results.append(result)
                if progress_callback:
                    processed_pages = min((idx + 1) * pages_per_process,
                                          total_pages)
                    progress_callback(processed_pages, total_pages)

        # Empty chunks (failed or blank ranges) are dropped.
        return "\n\n".join(filter(None, results))
    except Exception as e:
        return f"PDF processing error: {str(e)}"
|
94 |
|
|
|
101 |
return f.read()
|
102 |
|
103 |
if file_type == "pdf":
|
104 |
+
text = extract_all_pages(file_path, progress_callback)
|
105 |
result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
|
106 |
elif file_type == "csv":
|
107 |
df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
|