Ali2206 committed on
Commit
5eb9bf1
·
verified ·
1 Parent(s): c0b195c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -50
app.py CHANGED
@@ -16,10 +16,16 @@ import gc
16
  from diskcache import Cache
17
  import time
18
  import asyncio
19
- import pypdfium2 as pdfium
20
- import pytesseract
21
- from PIL import Image
22
- import io
 
 
 
 
 
 
23
 
24
  # Configure logging and suppress warnings
25
  logging.basicConfig(level=logging.INFO)
@@ -63,40 +69,72 @@ def file_hash(path: str) -> str:
63
 
64
  async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
65
  try:
66
- pdf = pdfium.PdfDocument(file_path)
67
- total_pages = len(pdf)
68
- if total_pages == 0:
69
- return ""
70
-
71
- batch_size = 5
72
- batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
73
- text_chunks = [""] * total_pages
74
- processed_pages = 0
75
-
76
- def extract_batch(start: int, end: int) -> List[tuple]:
77
- results = []
78
- for i in range(start, end):
79
- page = pdf[i]
80
- text = page.get_textpage().get_text_range() or ""
81
- if not text.strip() and use_ocr:
82
- # Fallback to OCR
83
- bitmap = page.render(scale=2).to_pil()
84
- text = pytesseract.image_to_string(bitmap, lang="eng")
85
- results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
86
- return results
87
-
88
- loop = asyncio.get_event_loop()
89
- with ThreadPoolExecutor(max_workers=4) as executor:
90
- futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
91
- for future in await asyncio.gather(*futures):
92
- for page_num, text in future:
93
- text_chunks[page_num] = text
94
- logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
95
- processed_pages += batch_size
96
- if progress_callback:
97
- progress_callback(min(processed_pages, total_pages), total_pages)
98
-
99
- pdf.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  extracted_text = "\n\n".join(filter(None, text_chunks))
101
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
102
  return extracted_text
@@ -113,7 +151,6 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
113
  return cache[cache_key]
114
 
115
  if file_type == "pdf":
116
- # Try without OCR first, fallback to OCR if empty
117
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
118
  if not text.strip() or "PDF processing error" in text:
119
  logger.info("Retrying extraction with OCR for %s", file_path)
@@ -158,7 +195,7 @@ def log_system_usage(tag=""):
158
 
159
  def clean_response(text: str) -> str:
160
  text = sanitize_utf8(text)
161
- text = text.replace("[", "").replace("]", "").replace("None", "") # Faster string ops
162
  text = text.replace("\n\n\n", "\n\n")
163
  sections = {}
164
  current_section = None
@@ -171,12 +208,12 @@ def clean_response(text: str) -> str:
171
  current_section = section_match.group(1)
172
  sections.setdefault(current_section, [])
173
  continue
174
- if current_section and line.startswith("- ") and "No issues identified" not in line:
175
  sections[current_section].append(line)
176
  cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
177
  result = "\n\n".join(cleaned).strip()
178
  logger.debug("Cleaned response length: %d chars", len(result))
179
- return result or ""
180
 
181
  def summarize_findings(combined_response: str) -> str:
182
  if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
@@ -265,8 +302,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
265
  yield history, None, ""
266
  logger.info("Extracted text length: %d chars", len(extracted))
267
 
268
- chunk_size = 4000 # Increased slightly
269
- chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
 
 
270
  logger.info("Created %d chunks", len(chunks))
271
  combined_response = ""
272
  batch_size = 2
@@ -282,7 +321,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
282
  async def process_chunk(prompt):
283
  chunk_response = ""
284
  for chunk_output in agent.run_gradio_chat(
285
- message=prompt, history=[], temperature=0.2, max_new_tokens=128, max_token=768, call_agent=False, conversation=[]
286
  ):
287
  if chunk_output is None:
288
  continue
@@ -290,12 +329,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
290
  for m in chunk_output:
291
  if hasattr(m, 'content') and m.content:
292
  cleaned = clean_response(m.content)
293
- if cleaned and re.search(r"###\s*\w+", cleaned):
294
- chunk_response += cleaned + "\n\n"
295
  elif isinstance(chunk_output, str) and chunk_output.strip():
296
  cleaned = clean_response(chunk_output)
297
- if cleaned and re.search(r"###\s*\w+", cleaned):
298
- chunk_response += cleaned + "\n\n"
299
  logger.debug("Chunk response length: %d chars", len(chunk_response))
300
  return chunk_response
301
 
@@ -305,7 +342,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
305
  gc.collect()
306
 
307
  for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
308
- if chunk_response:
309
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
310
  else:
311
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
 
16
  from diskcache import Cache
17
  import time
18
  import asyncio
19
+
20
+ # Try importing pypdfium2 and pytesseract, fall back to pdfplumber
21
+ try:
22
+ import pypdfium2 as pdfium
23
+ import pytesseract
24
+ from PIL import Image
25
+ HAS_PYPDFIUM2 = True
26
+ except ImportError:
27
+ HAS_PYPDFIUM2 = False
28
+ import pdfplumber
29
 
30
  # Configure logging and suppress warnings
31
  logging.basicConfig(level=logging.INFO)
 
69
 
70
  async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
71
  try:
72
+ if HAS_PYPDFIUM2:
73
+ pdf = pdfium.PdfDocument(file_path)
74
+ total_pages = len(pdf)
75
+ if total_pages == 0:
76
+ return ""
77
+
78
+ batch_size = 5
79
+ batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
80
+ text_chunks = [""] * total_pages
81
+ processed_pages = 0
82
+
83
+ def extract_batch(start: int, end: int) -> List[tuple]:
84
+ results = []
85
+ for i in range(start, end):
86
+ page = pdf[i]
87
+ text = page.get_textpage().get_text_range() or ""
88
+ if not text.strip() and use_ocr and 'pytesseract' in sys.modules:
89
+ bitmap = page.render(scale=2).to_pil()
90
+ text = pytesseract.image_to_string(bitmap, lang="eng")
91
+ results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
92
+ return results
93
+
94
+ loop = asyncio.get_event_loop()
95
+ with ThreadPoolExecutor(max_workers=4) as executor:
96
+ futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
97
+ for future in await asyncio.gather(*futures):
98
+ for page_num, text in future:
99
+ text_chunks[page_num] = text
100
+ logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
101
+ processed_pages += batch_size
102
+ if progress_callback:
103
+ progress_callback(min(processed_pages, total_pages), total_pages)
104
+
105
+ pdf.close()
106
+ else:
107
+ # Fallback to pdfplumber
108
+ with pdfplumber.open(file_path) as pdf:
109
+ total_pages = len(pdf.pages)
110
+ if total_pages == 0:
111
+ return ""
112
+
113
+ batch_size = 5
114
+ batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
115
+ text_chunks = [""] * total_pages
116
+ processed_pages = 0
117
+
118
+ def extract_batch(start: int, end: int) -> List[tuple]:
119
+ results = []
120
+ with pdfplumber.open(file_path) as pdf:
121
+ for i in range(start, end):
122
+ page = pdf.pages[i]
123
+ text = page.extract_text() or ""
124
+ results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
125
+ return results
126
+
127
+ loop = asyncio.get_event_loop()
128
+ with ThreadPoolExecutor(max_workers=4) as executor:
129
+ futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
130
+ for future in await asyncio.gather(*futures):
131
+ for page_num, text in future:
132
+ text_chunks[page_num] = text
133
+ logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
134
+ processed_pages += batch_size
135
+ if progress_callback:
136
+ progress_callback(min(processed_pages, total_pages), total_pages)
137
+
138
  extracted_text = "\n\n".join(filter(None, text_chunks))
139
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
140
  return extracted_text
 
151
  return cache[cache_key]
152
 
153
  if file_type == "pdf":
 
154
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
155
  if not text.strip() or "PDF processing error" in text:
156
  logger.info("Retrying extraction with OCR for %s", file_path)
 
195
 
196
  def clean_response(text: str) -> str:
197
  text = sanitize_utf8(text)
198
+ text = text.replace("[", "").replace("]", "").replace("None", "")
199
  text = text.replace("\n\n\n", "\n\n")
200
  sections = {}
201
  current_section = None
 
208
  current_section = section_match.group(1)
209
  sections.setdefault(current_section, [])
210
  continue
211
+ if current_section and line.startswith("- "):
212
  sections[current_section].append(line)
213
  cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
214
  result = "\n\n".join(cleaned).strip()
215
  logger.debug("Cleaned response length: %d chars", len(result))
216
+ return result or "No issues identified"
217
 
218
  def summarize_findings(combined_response: str) -> str:
219
  if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
 
302
  yield history, None, ""
303
  logger.info("Extracted text length: %d chars", len(extracted))
304
 
305
+ chunk_size = 3000 # Adjusted for balance
306
+ chunks = [extracted[i:i + chunk_size] for i in range(0, max(len(extracted), 1), chunk_size)]
307
+ if not chunks:
308
+ chunks = [""] # Ensure at least one chunk
309
  logger.info("Created %d chunks", len(chunks))
310
  combined_response = ""
311
  batch_size = 2
 
321
  async def process_chunk(prompt):
322
  chunk_response = ""
323
  for chunk_output in agent.run_gradio_chat(
324
+ message=prompt, history=[], temperature=0.2, max_new_tokens=256, max_token=1024, call_agent=False, conversation=[]
325
  ):
326
  if chunk_output is None:
327
  continue
 
329
  for m in chunk_output:
330
  if hasattr(m, 'content') and m.content:
331
  cleaned = clean_response(m.content)
332
+ chunk_response += cleaned + "\n\n"
 
333
  elif isinstance(chunk_output, str) and chunk_output.strip():
334
  cleaned = clean_response(chunk_output)
335
+ chunk_response += cleaned + "\n\n"
 
336
  logger.debug("Chunk response length: %d chars", len(chunk_response))
337
  return chunk_response
338
 
 
342
  gc.collect()
343
 
344
  for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
345
+ if chunk_response.strip():
346
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
347
  else:
348
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"