Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ import shutil
|
|
11 |
import re
|
12 |
import psutil
|
13 |
import subprocess
|
|
|
|
|
14 |
import time
|
15 |
|
16 |
# Persistent directory
|
@@ -38,9 +40,6 @@ sys.path.insert(0, src_path)
|
|
38 |
|
39 |
from txagent.txagent import TxAgent
|
40 |
|
41 |
-
MEDICAL_KEYWORDS = {'diagnosis', 'assessment', 'plan', 'results', 'medications',
|
42 |
-
'allergies', 'summary', 'impression', 'findings', 'recommendations'}
|
43 |
-
|
44 |
def sanitize_utf8(text: str) -> str:
|
45 |
return text.encode("utf-8", "ignore").decode("utf-8")
|
46 |
|
@@ -48,20 +47,48 @@ def file_hash(path: str) -> str:
|
|
48 |
with open(path, "rb") as f:
|
49 |
return hashlib.md5(f.read()).hexdigest()
|
50 |
|
51 |
-
def
|
|
|
52 |
try:
|
53 |
text_chunks = []
|
54 |
with pdfplumber.open(file_path) as pdf:
|
55 |
-
|
56 |
-
processed_pages = 0
|
57 |
-
for i, page in enumerate(pdf.pages):
|
58 |
page_text = page.extract_text() or ""
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
if progress_callback:
|
|
|
63 |
progress_callback(processed_pages, total_pages)
|
64 |
-
|
|
|
65 |
except Exception as e:
|
66 |
return f"PDF processing error: {str(e)}"
|
67 |
|
@@ -74,7 +101,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
|
|
74 |
return f.read()
|
75 |
|
76 |
if file_type == "pdf":
|
77 |
-
text =
|
78 |
result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
|
79 |
elif file_type == "csv":
|
80 |
df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
|
|
|
11 |
import re
|
12 |
import psutil
|
13 |
import subprocess
|
14 |
+
import multiprocessing
|
15 |
+
from functools import partial
|
16 |
import time
|
17 |
|
18 |
# Persistent directory
|
|
|
40 |
|
41 |
from txagent.txagent import TxAgent
|
42 |
|
|
|
|
|
|
|
43 |
def sanitize_utf8(text: str) -> str:
    """Return *text* with any non-UTF-8-encodable code points dropped."""
    # Round-trip through bytes: encoding with errors="ignore" silently
    # discards unencodable characters (e.g. lone surrogates).
    cleaned = text.encode("utf-8", "ignore")
    return cleaned.decode("utf-8")
|
45 |
|
|
|
47 |
with open(path, "rb") as f:
|
48 |
return hashlib.md5(f.read()).hexdigest()
|
49 |
|
50 |
+
def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
    """Extract text from PDF pages in the half-open range [start_page, end_page).

    Designed to run as a multiprocessing worker: the PDF is reopened in
    each process and all failures are swallowed.

    Args:
        file_path: Path to the PDF file.
        start_page: 0-based index of the first page (inclusive).
        end_page: 0-based index one past the last page (exclusive).

    Returns:
        The pages' text joined by blank lines, each chunk preceded by a
        "=== Page N ===" header (N is the 1-based absolute page number),
        or "" if the file cannot be opened or parsed.
    """
    try:
        text_chunks = []
        with pdfplumber.open(file_path) as pdf:
            # enumerate(..., start=start_page) yields the absolute 0-based
            # page index directly.  The original computed
            # `start_page + pdf.pages.index(page) + 1`, but list.index()
            # already returns the ABSOLUTE position, so start_page was
            # double-counted (page 2 of a range starting at 2 was labeled
            # "Page 5" instead of "Page 3") — and index() is O(n) per page.
            for abs_idx, page in enumerate(pdf.pages[start_page:end_page],
                                           start=start_page):
                page_text = page.extract_text() or ""
                text_chunks.append(
                    f"=== Page {abs_idx + 1} ===\n{page_text.strip()}"
                )
        return "\n\n".join(text_chunks)
    except Exception:
        # Worker process: report failure as an empty result; the caller
        # filters out empty chunks.
        return ""
|
61 |
+
|
62 |
+
def extract_all_pages(file_path: str, progress_callback=None) -> str:
    """Extract text from all pages of a PDF using parallel processing.

    The page count is read once in this process; the pages are then split
    into contiguous chunks and extracted by a pool of worker processes
    via :func:`extract_page_range`.

    Args:
        file_path: Path to the PDF file.
        progress_callback: Optional ``callable(processed_pages, total_pages)``.
            NOTE(review): ``Pool.starmap`` blocks until every worker is done,
            so the callbacks fire in a burst after extraction completes, not
            incrementally during it — switch to ``imap`` if live progress
            is required.

    Returns:
        All non-empty page chunks joined by blank lines, "" for an empty
        PDF, or a string starting with "PDF processing error:" on failure.
    """
    try:
        # Open only to count pages; each worker reopens the file itself.
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

        if total_pages == 0:
            return ""

        # Cap at 6 workers (empirical choice); fewer on small machines.
        num_processes = min(6, multiprocessing.cpu_count())
        pages_per_process = max(1, total_pages // num_processes)

        # Build half-open [start, end) chunks.  When total_pages <
        # num_processes the naive list contains degenerate ranges
        # (start >= end, e.g. (4, 3) for 3 pages / 6 workers), which the
        # original dispatched as useless pool tasks — drop them instead.
        ranges = [
            (i * pages_per_process,
             min((i + 1) * pages_per_process, total_pages))
            for i in range(num_processes)
        ]
        ranges = [(start, end) for start, end in ranges if start < end]

        # Ensure the final chunk reaches the last page (integer division
        # above can leave a remainder).
        if ranges and ranges[-1][1] != total_pages:
            ranges[-1] = (ranges[-1][0], total_pages)

        # Fan the chunks out to the worker pool.
        with multiprocessing.Pool(processes=num_processes) as pool:
            extract_func = partial(extract_page_range, file_path)
            results = []
            for idx, result in enumerate(pool.starmap(extract_func, ranges)):
                results.append(result)
                if progress_callback:
                    processed_pages = min((idx + 1) * pages_per_process,
                                          total_pages)
                    progress_callback(processed_pages, total_pages)

        # Empty chunks (failed or blank ranges) are dropped.
        return "\n\n".join(filter(None, results))
    except Exception as e:
        return f"PDF processing error: {str(e)}"
|
94 |
|
|
|
101 |
return f.read()
|
102 |
|
103 |
if file_type == "pdf":
|
104 |
+
text = extract_all_pages(file_path, progress_callback)
|
105 |
result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
|
106 |
elif file_type == "csv":
|
107 |
df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
|