Update app.py
Browse files
app.py
CHANGED
@@ -16,10 +16,16 @@ import gc
|
|
16 |
from diskcache import Cache
|
17 |
import time
|
18 |
import asyncio
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
# Configure logging and suppress warnings
|
25 |
logging.basicConfig(level=logging.INFO)
|
@@ -63,40 +69,72 @@ def file_hash(path: str) -> str:
|
|
63 |
|
64 |
async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
|
65 |
try:
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
extracted_text = "\n\n".join(filter(None, text_chunks))
|
101 |
logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
|
102 |
return extracted_text
|
@@ -113,7 +151,6 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
|
|
113 |
return cache[cache_key]
|
114 |
|
115 |
if file_type == "pdf":
|
116 |
-
# Try without OCR first, fallback to OCR if empty
|
117 |
text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
|
118 |
if not text.strip() or "PDF processing error" in text:
|
119 |
logger.info("Retrying extraction with OCR for %s", file_path)
|
@@ -158,7 +195,7 @@ def log_system_usage(tag=""):
|
|
158 |
|
159 |
def clean_response(text: str) -> str:
|
160 |
text = sanitize_utf8(text)
|
161 |
-
text = text.replace("[", "").replace("]", "").replace("None", "")
|
162 |
text = text.replace("\n\n\n", "\n\n")
|
163 |
sections = {}
|
164 |
current_section = None
|
@@ -171,12 +208,12 @@ def clean_response(text: str) -> str:
|
|
171 |
current_section = section_match.group(1)
|
172 |
sections.setdefault(current_section, [])
|
173 |
continue
|
174 |
-
if current_section and line.startswith("- ")
|
175 |
sections[current_section].append(line)
|
176 |
cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
|
177 |
result = "\n\n".join(cleaned).strip()
|
178 |
logger.debug("Cleaned response length: %d chars", len(result))
|
179 |
-
return result or ""
|
180 |
|
181 |
def summarize_findings(combined_response: str) -> str:
|
182 |
if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
|
@@ -265,8 +302,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
265 |
yield history, None, ""
|
266 |
logger.info("Extracted text length: %d chars", len(extracted))
|
267 |
|
268 |
-
chunk_size =
|
269 |
-
chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
|
|
|
|
|
270 |
logger.info("Created %d chunks", len(chunks))
|
271 |
combined_response = ""
|
272 |
batch_size = 2
|
@@ -282,7 +321,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
282 |
async def process_chunk(prompt):
|
283 |
chunk_response = ""
|
284 |
for chunk_output in agent.run_gradio_chat(
|
285 |
-
message=prompt, history=[], temperature=0.2, max_new_tokens=
|
286 |
):
|
287 |
if chunk_output is None:
|
288 |
continue
|
@@ -290,12 +329,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
290 |
for m in chunk_output:
|
291 |
if hasattr(m, 'content') and m.content:
|
292 |
cleaned = clean_response(m.content)
|
293 |
-
|
294 |
-
chunk_response += cleaned + "\n\n"
|
295 |
elif isinstance(chunk_output, str) and chunk_output.strip():
|
296 |
cleaned = clean_response(chunk_output)
|
297 |
-
|
298 |
-
chunk_response += cleaned + "\n\n"
|
299 |
logger.debug("Chunk response length: %d chars", len(chunk_response))
|
300 |
return chunk_response
|
301 |
|
@@ -305,7 +342,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
|
|
305 |
gc.collect()
|
306 |
|
307 |
for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
|
308 |
-
if chunk_response:
|
309 |
combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
|
310 |
else:
|
311 |
combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
|
|
|
16 |
from diskcache import Cache
|
17 |
import time
|
18 |
import asyncio
|
19 |
+
|
20 |
+
# Try importing pypdfium2 and pytesseract, fall back to pdfplumber
|
21 |
+
try:
|
22 |
+
import pypdfium2 as pdfium
|
23 |
+
import pytesseract
|
24 |
+
from PIL import Image
|
25 |
+
HAS_PYPDFIUM2 = True
|
26 |
+
except ImportError:
|
27 |
+
HAS_PYPDFIUM2 = False
|
28 |
+
import pdfplumber
|
29 |
|
30 |
# Configure logging and suppress warnings
|
31 |
logging.basicConfig(level=logging.INFO)
|
|
|
69 |
|
70 |
async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
|
71 |
try:
|
72 |
+
if HAS_PYPDFIUM2:
|
73 |
+
pdf = pdfium.PdfDocument(file_path)
|
74 |
+
total_pages = len(pdf)
|
75 |
+
if total_pages == 0:
|
76 |
+
return ""
|
77 |
+
|
78 |
+
batch_size = 5
|
79 |
+
batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
|
80 |
+
text_chunks = [""] * total_pages
|
81 |
+
processed_pages = 0
|
82 |
+
|
83 |
+
def extract_batch(start: int, end: int) -> List[tuple]:
|
84 |
+
results = []
|
85 |
+
for i in range(start, end):
|
86 |
+
page = pdf[i]
|
87 |
+
text = page.get_textpage().get_text_range() or ""
|
88 |
+
if not text.strip() and use_ocr and 'pytesseract' in sys.modules:
|
89 |
+
bitmap = page.render(scale=2).to_pil()
|
90 |
+
text = pytesseract.image_to_string(bitmap, lang="eng")
|
91 |
+
results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
|
92 |
+
return results
|
93 |
+
|
94 |
+
loop = asyncio.get_event_loop()
|
95 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
96 |
+
futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
|
97 |
+
for future in await asyncio.gather(*futures):
|
98 |
+
for page_num, text in future:
|
99 |
+
text_chunks[page_num] = text
|
100 |
+
logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
|
101 |
+
processed_pages += batch_size
|
102 |
+
if progress_callback:
|
103 |
+
progress_callback(min(processed_pages, total_pages), total_pages)
|
104 |
+
|
105 |
+
pdf.close()
|
106 |
+
else:
|
107 |
+
# Fallback to pdfplumber
|
108 |
+
with pdfplumber.open(file_path) as pdf:
|
109 |
+
total_pages = len(pdf.pages)
|
110 |
+
if total_pages == 0:
|
111 |
+
return ""
|
112 |
+
|
113 |
+
batch_size = 5
|
114 |
+
batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
|
115 |
+
text_chunks = [""] * total_pages
|
116 |
+
processed_pages = 0
|
117 |
+
|
118 |
+
def extract_batch(start: int, end: int) -> List[tuple]:
|
119 |
+
results = []
|
120 |
+
with pdfplumber.open(file_path) as pdf:
|
121 |
+
for i in range(start, end):
|
122 |
+
page = pdf.pages[i]
|
123 |
+
text = page.extract_text() or ""
|
124 |
+
results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
|
125 |
+
return results
|
126 |
+
|
127 |
+
loop = asyncio.get_event_loop()
|
128 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
129 |
+
futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
|
130 |
+
for future in await asyncio.gather(*futures):
|
131 |
+
for page_num, text in future:
|
132 |
+
text_chunks[page_num] = text
|
133 |
+
logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
|
134 |
+
processed_pages += batch_size
|
135 |
+
if progress_callback:
|
136 |
+
progress_callback(min(processed_pages, total_pages), total_pages)
|
137 |
+
|
138 |
extracted_text = "\n\n".join(filter(None, text_chunks))
|
139 |
logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
|
140 |
return extracted_text
|
|
|
151 |
return cache[cache_key]
|
152 |
|
153 |
if file_type == "pdf":
|
|
|
154 |
text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
|
155 |
if not text.strip() or "PDF processing error" in text:
|
156 |
logger.info("Retrying extraction with OCR for %s", file_path)
|
|
|
195 |
|
196 |
def clean_response(text: str) -> str:
|
197 |
text = sanitize_utf8(text)
|
198 |
+
text = text.replace("[", "").replace("]", "").replace("None", "")
|
199 |
text = text.replace("\n\n\n", "\n\n")
|
200 |
sections = {}
|
201 |
current_section = None
|
|
|
208 |
current_section = section_match.group(1)
|
209 |
sections.setdefault(current_section, [])
|
210 |
continue
|
211 |
+
if current_section and line.startswith("- "):
|
212 |
sections[current_section].append(line)
|
213 |
cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
|
214 |
result = "\n\n".join(cleaned).strip()
|
215 |
logger.debug("Cleaned response length: %d chars", len(result))
|
216 |
+
return result or "No issues identified"
|
217 |
|
218 |
def summarize_findings(combined_response: str) -> str:
|
219 |
if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
|
|
|
302 |
yield history, None, ""
|
303 |
logger.info("Extracted text length: %d chars", len(extracted))
|
304 |
|
305 |
+
chunk_size = 3000 # Adjusted for balance
|
306 |
+
chunks = [extracted[i:i + chunk_size] for i in range(0, max(len(extracted), 1), chunk_size)]
|
307 |
+
if not chunks:
|
308 |
+
chunks = [""] # Ensure at least one chunk
|
309 |
logger.info("Created %d chunks", len(chunks))
|
310 |
combined_response = ""
|
311 |
batch_size = 2
|
|
|
321 |
async def process_chunk(prompt):
|
322 |
chunk_response = ""
|
323 |
for chunk_output in agent.run_gradio_chat(
|
324 |
+
message=prompt, history=[], temperature=0.2, max_new_tokens=256, max_token=1024, call_agent=False, conversation=[]
|
325 |
):
|
326 |
if chunk_output is None:
|
327 |
continue
|
|
|
329 |
for m in chunk_output:
|
330 |
if hasattr(m, 'content') and m.content:
|
331 |
cleaned = clean_response(m.content)
|
332 |
+
chunk_response += cleaned + "\n\n"
|
|
|
333 |
elif isinstance(chunk_output, str) and chunk_output.strip():
|
334 |
cleaned = clean_response(chunk_output)
|
335 |
+
chunk_response += cleaned + "\n\n"
|
|
|
336 |
logger.debug("Chunk response length: %d chars", len(chunk_response))
|
337 |
return chunk_response
|
338 |
|
|
|
342 |
gc.collect()
|
343 |
|
344 |
for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
|
345 |
+
if chunk_response.strip():
|
346 |
combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
|
347 |
else:
|
348 |
combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
|