Ali2206 committed on
Commit
cbd84d4
·
verified ·
1 Parent(s): 463c8b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -12
app.py CHANGED
@@ -11,6 +11,8 @@ import shutil
11
  import re
12
  import psutil
13
  import subprocess
 
 
14
  import time
15
 
16
  # Persistent directory
@@ -38,9 +40,6 @@ sys.path.insert(0, src_path)
38
 
39
  from txagent.txagent import TxAgent
40
 
41
- MEDICAL_KEYWORDS = {'diagnosis', 'assessment', 'plan', 'results', 'medications',
42
- 'allergies', 'summary', 'impression', 'findings', 'recommendations'}
43
-
44
  def sanitize_utf8(text: str) -> str:
45
  return text.encode("utf-8", "ignore").decode("utf-8")
46
 
@@ -48,20 +47,48 @@ def file_hash(path: str) -> str:
48
  with open(path, "rb") as f:
49
  return hashlib.md5(f.read()).hexdigest()
50
 
51
- def extract_priority_pages(file_path: str, progress_callback=None) -> str:
 
52
  try:
53
  text_chunks = []
54
  with pdfplumber.open(file_path) as pdf:
55
- total_pages = len(pdf.pages)
56
- processed_pages = 0
57
- for i, page in enumerate(pdf.pages):
58
  page_text = page.extract_text() or ""
59
- if i < 3 or any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
60
- text_chunks.append(f"=== Page {i+1} ===\n{page_text.strip()}")
61
- processed_pages += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if progress_callback:
 
63
  progress_callback(processed_pages, total_pages)
64
- return "\n\n".join(text_chunks)
 
65
  except Exception as e:
66
  return f"PDF processing error: {str(e)}"
67
 
@@ -74,7 +101,7 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
74
  return f.read()
75
 
76
  if file_type == "pdf":
77
- text = extract_priority_pages(file_path, progress_callback)
78
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
79
  elif file_type == "csv":
80
  df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
 
11
  import re
12
  import psutil
13
  import subprocess
14
+ import multiprocessing
15
+ from functools import partial
16
  import time
17
 
18
  # Persistent directory
 
40
 
41
  from txagent.txagent import TxAgent
42
 
 
 
 
43
def sanitize_utf8(text: str) -> str:
    """Return *text* with any sequences that cannot survive a UTF-8 round trip removed."""
    cleaned = text.encode("utf-8", errors="ignore")
    return cleaned.decode("utf-8")
45
 
 
47
  with open(path, "rb") as f:
48
  return hashlib.md5(f.read()).hexdigest()
49
 
50
def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
    """Extract text from the half-open page range [start_page, end_page) of a PDF.

    Intended to run inside a multiprocessing worker, so it is defined at
    module level and never raises: any failure yields "" so the parent
    process can simply skip this range.

    Args:
        file_path: Path to the PDF file.
        start_page: Zero-based index of the first page to extract.
        end_page: Zero-based index one past the last page to extract.

    Returns:
        The pages' text joined with blank lines, each prefixed with a
        "=== Page N ===" header (1-based), or "" on any error.
    """
    try:
        text_chunks = []
        with pdfplumber.open(file_path) as pdf:
            # enumerate from start_page instead of pdf.pages.index(page):
            # .index() rescans the page list on every iteration (quadratic
            # overall) and can mis-number pages if two pages compare equal.
            for page_number, page in enumerate(pdf.pages[start_page:end_page],
                                              start=start_page + 1):
                page_text = page.extract_text() or ""
                text_chunks.append(f"=== Page {page_number} ===\n{page_text.strip()}")
        return "\n\n".join(text_chunks)
    except Exception:
        return ""
61
+
62
def extract_all_pages(file_path: str, progress_callback=None) -> str:
    """Extract text from every page of a PDF using a process pool.

    The page list is split into contiguous half-open ranges, one per
    worker, and each range is handled by ``extract_page_range`` in a
    child process.

    Args:
        file_path: Path to the PDF file.
        progress_callback: Optional ``callable(processed_pages, total_pages)``
            invoked as each range's result arrives. Page counts are
            approximate because results are reported per range, not per page.

    Returns:
        The concatenated page text (empty ranges dropped), "" for a
        zero-page PDF, or an error string on failure.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

        if total_pages == 0:
            return ""

        # Cap workers at 6 (enough for I/O-heavy PDF extraction) and never
        # exceed the page count: previously a 3-page PDF on a 6-core box
        # produced empty/inverted ranges like (4, 3) for the idle workers.
        num_processes = min(6, multiprocessing.cpu_count(), total_pages)
        pages_per_process = max(1, total_pages // num_processes)

        # Contiguous half-open ranges; the last range absorbs any remainder
        # left by integer division.
        ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
                  for i in range(num_processes)]
        if ranges[-1][1] != total_pages:
            ranges[-1] = (ranges[-1][0], total_pages)

        with multiprocessing.Pool(processes=num_processes) as pool:
            # partial binds file_path; starmap then unpacks each
            # (start_page, end_page) tuple into the remaining arguments.
            extract_func = partial(extract_page_range, file_path)
            results = []
            for idx, result in enumerate(pool.starmap(extract_func, ranges)):
                results.append(result)
                if progress_callback:
                    processed_pages = min((idx + 1) * pages_per_process, total_pages)
                    progress_callback(processed_pages, total_pages)

        # filter(None, ...) drops ranges whose worker failed and returned "".
        return "\n\n".join(filter(None, results))
    except Exception as e:
        return f"PDF processing error: {str(e)}"
94
 
 
101
  return f.read()
102
 
103
  if file_type == "pdf":
104
+ text = extract_all_pages(file_path, progress_callback)
105
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
106
  elif file_type == "csv":
107
  df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,