Ali2206 committed
Commit 90e24e0 · verified · 1 Parent(s): 4d00da5

Update app.py

Files changed (1):
  1. app.py +46 -5
app.py CHANGED
@@ -6,6 +6,12 @@ import gradio as gr
 from typing import List, Dict
 from concurrent.futures import ThreadPoolExecutor
 import hashlib
+import multiprocessing
+from functools import partial
+import logging
+
+# Suppress pdfplumber CropBox warnings
+logging.getLogger("pdfplumber").setLevel(logging.ERROR)
 
 # Persistent directories
 persistent_dir = "/data/hf_cache"
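
Why a single logging call is enough here: pdfplumber reports its CropBox fallbacks through Python's standard logging hierarchy, so raising the package logger's threshold filters them before any handler runs. A minimal standalone sketch (the child logger name is illustrative):

    import logging

    # Raise the package logger's threshold; WARNING-level messages
    # (such as the CropBox notices) are filtered out.
    logging.getLogger("pdfplumber").setLevel(logging.ERROR)

    # Child loggers, e.g. "pdfplumber.page", set no level of their own
    # and inherit the effective level from the parent.
    assert logging.getLogger("pdfplumber.page").getEffectiveLevel() == logging.ERROR
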
@@ -24,18 +30,46 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
 
-def extract_all_pages(file_path: str) -> str:
-    """Extract text from all pages of a PDF."""
+def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
+    """Extract text from a range of PDF pages."""
     try:
         text_chunks = []
         with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
+            for page in pdf.pages[start_page:end_page]:
                 page_text = page.extract_text() or ""
                 text_chunks.append(page_text.strip())
         return "\n".join(text_chunks)
     except Exception:
         return ""
 
+def extract_all_pages(file_path: str) -> str:
+    """Extract text from all pages of a PDF using parallel processing."""
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            total_pages = len(pdf.pages)
+
+        if total_pages == 0:
+            return ""
+
+        # Use 4 processes (adjust based on CPU cores)
+        num_processes = min(4, multiprocessing.cpu_count())
+        pages_per_process = max(1, total_pages // num_processes)
+
+        # Create page ranges for parallel processing
+        ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
+                  for i in range(num_processes)]
+        if ranges[-1][1] != total_pages:
+            ranges[-1] = (ranges[-1][0], total_pages)
+
+        # Process page ranges in parallel
+        with multiprocessing.Pool(processes=num_processes) as pool:
+            extract_func = partial(extract_page_range, file_path)
+            results = pool.starmap(extract_func, ranges)
+
+        return "\n".join(filter(None, results))
+    except Exception:
+        return ""
+
 def convert_file_to_text(file_path: str, file_type: str) -> str:
     """Convert supported file types to text, caching results."""
     try:
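
The chunking arithmetic in extract_all_pages can be sanity-checked in isolation. Below is a sketch with a hypothetical split_ranges helper (not part of app.py) that mirrors the range construction; a page count that does not divide evenly is folded into the last range:

    from typing import List, Tuple

    def split_ranges(total_pages: int, num_processes: int) -> List[Tuple[int, int]]:
        # Mirrors the chunking in extract_all_pages: floor-divide pages across
        # workers, then stretch the final range to absorb any remainder.
        pages_per_process = max(1, total_pages // num_processes)
        ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
                  for i in range(num_processes)]
        if ranges[-1][1] != total_pages:
            ranges[-1] = (ranges[-1][0], total_pages)
        return ranges

    # 10 pages over 4 workers: 10 // 4 = 2 pages each, remainder in the last range.
    assert split_ranges(10, 4) == [(0, 2), (2, 4), (4, 6), (6, 10)]

Each worker then reopens the PDF for its own range inside extract_page_range, which avoids sending unpicklable page objects across process boundaries.
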
@@ -58,6 +92,8 @@ def convert_file_to_text(file_path: str, file_type: str) -> str:
             text = ""
 
         if text:
+            # Compress text by removing redundant whitespace
+            text = re.sub(r'\s+', ' ', text).strip()
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(text)
             return text
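
The compression is a single regex pass; note that this diff does not add an import re, so the line assumes re is already imported elsewhere in app.py. A standalone illustration of what the pass does:

    import re

    raw = "Visit 1\nBP: 120/80\n\nVisit 2\tBP: 118/76"
    compressed = re.sub(r'\s+', ' ', raw).strip()

    # Runs of spaces, tabs, and newlines all become single spaces, so the
    # page/line structure of the original PDF is not recoverable from the cache.
    assert compressed == "Visit 1 BP: 120/80 Visit 2 BP: 118/76"
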
@@ -149,7 +185,7 @@ def create_ui():
     def analyze(message: str, history: List[dict], files: List):
         """Handle analysis and return results."""
         history.append({"role": "user", "content": message})
-        history.append({"role": "assistant", "content": "⏳ Analyzing..."})
+        history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
         yield history, None
 
         extracted_text = ""
@@ -161,17 +197,22 @@ def create_ui():
             extracted_text = "\n".join(sanitize_utf8(r) for r in results if r)
         file_hash_value = file_hash(files[0].name) if files else ""
 
-        history.pop()  # Remove "Analyzing..."
+        history.pop()  # Remove "Extracting..."
+        history.append({"role": "assistant", "content": "⏳ Analyzing medical records..."})
+        yield history, None
+
         report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
 
         try:
             response = analyze_medical_records(extracted_text)
+            history.pop()  # Remove "Analyzing..."
             history.append({"role": "assistant", "content": response})
             if report_path:
                 with open(report_path, "w", encoding="utf-8") as f:
                     f.write(response)
             yield history, report_path if report_path and os.path.exists(report_path) else None
         except Exception as e:
+            history.pop()  # Remove "Analyzing..."
             history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
             yield history, None
 
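
The reworked handler is an instance of a placeholder pattern for streaming chat UIs: append an interim status message, yield so the UI updates, then pop it before appending the real content. A framework-free sketch of the same shape (with_status and run_step are hypothetical stand-ins, not part of app.py):

    from typing import Callable, Dict, Iterator, List

    def with_status(history: List[Dict[str, str]], run_step: Callable[[], str]) -> Iterator[List[Dict[str, str]]]:
        # Show a temporary status bubble while the slow step runs.
        history.append({"role": "assistant", "content": "⏳ Working..."})
        yield history
        try:
            result = run_step()
            history.pop()  # Replace the placeholder with the real answer.
            history.append({"role": "assistant", "content": result})
        except Exception as e:
            history.pop()  # Or with the error, so the placeholder never lingers.
            history.append({"role": "assistant", "content": f"❌ Error: {e}"})
        yield history

    for snapshot in with_status([], lambda: "Report ready."):
        print(snapshot[-1]["content"])  # "⏳ Working...", then "Report ready."

Popping on both the success and the error path matches the diff above: the placeholder is removed exactly once no matter which branch runs.
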
 
 
218