Update app.py
app.py
CHANGED
@@ -73,51 +73,59 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_ocr=False):
         total_pages = 0
         text_chunks = []

-
-
-            total_pages = len(pdf)
+        with pdfplumber.open(file_path) as pdf:
+            total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("No pages found in PDF")
                 return ""

             def extract_page(i):
-                page = pdf[i]
-
-
-
-
-
+                page = pdf.pages[i]
+                # Try table extraction first
+                text = ""
+                tables = page.extract_tables()
+                if tables:
+                    for table in tables:
+                        # Mimic Excel/CSV: join non-None cells as strings
+                        table_text = "\n".join(
+                            " | ".join(str(cell) if cell is not None else "" for cell in row)
+                            for row in table
+                        )
+                        text += table_text + "\n\n"
+                    logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
+                else:
+                    # Fall back to raw text
+                    text = page.extract_text() or ""
+                    logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
+
+                # OCR if text is short or force_ocr is True
+                if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
+                    try:
+                        logger.info("Attempting OCR for page %d", i + 1)
+                        pdfium_pdf = pdfium.PdfDocument(file_path)
+                        page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
+                        text = pytesseract.image_to_string(page_bitmap, lang="eng")
+                        logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
+                        pdfium_pdf.close()
+                    except Exception as e:
+                        logger.error("OCR failed for page %d: %s", i + 1, e)
+                        text = text or ""
                 return (i, f"=== Page {i + 1} ===\n{text.strip()}")

             with ThreadPoolExecutor(max_workers=4) as executor:
                 futures = [executor.submit(extract_page, i) for i in range(total_pages)]
-                for future in as_completed(futures):
+                for future in futures:
                     page_num, text = future.result()
                     text_chunks.append((page_num, text))
                     logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
                     if progress_callback:
                         progress_callback(page_num + 1, total_pages)

-
-
-            pdf.close()
-        else:
-            with pdfplumber.open(file_path) as pdf:
-                total_pages = len(pdf.pages)
-                if total_pages == 0:
-                    return ""
-
-                for i, page in enumerate(pdf.pages):
-                    text = page.extract_text() or ""
-                    text_chunks.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
-                    logger.debug("Page %d extracted: %s...", i + 1, text[:50])
-                    if progress_callback:
-                        progress_callback(i + 1, total_pages)
-
-        extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
-
+            text_chunks.sort(key=lambda x: x[0])
+            extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
         logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
         if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
-            logger.info("Text too short,
+            logger.info("Text too short, forcing OCR for all pages")
             return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
         return extracted_text
     except Exception as e:
@@ -276,6 +284,7 @@ Patient Record Excerpt:
 """

 async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
+    start_time = time.time()
     history.append({"role": "user", "content": message})
     yield history, None, ""

@@ -288,7 +297,7 @@ Patient Record Excerpt:

         futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
         results = [sanitize_utf8(future) for future in futures]
-        extracted = "\n".join(results)
+        extracted = "\n".join([json.loads(r).get("content", "") for r in results if "content" in json.loads(r)])
         file_hash_value = file_hash(files[0].name) if files else ""

         history.append({"role": "assistant", "content": "✅ Text extraction complete."})
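The reworked `extracted` line pulls only the `"content"` field out of each converter result instead of joining raw JSON strings, though it calls `json.loads(r)` twice per result (once in the filter, once for the value). A minimal equivalent that parses each result once, assuming each result is a JSON object that may carry a `"content"` key — the sample results are hypothetical:

```python
import json

# Hypothetical converter results, shaped like convert_file_to_json output.
results = [
    json.dumps({"filename": "record.pdf", "content": "=== Page 1 ===\nBP 120/80"}),
    json.dumps({"error": "unsupported file type"}),  # no "content" key
]

parsed = [json.loads(r) for r in results]  # parse each result once, not twice
extracted = "\n".join(p["content"] for p in parsed if "content" in p)
print(extracted)
```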
@@ -329,7 +338,7 @@ Patient Record Excerpt:
             raw_outputs.append(chunk_output)
             cleaned = clean_response(chunk_output)
             chunk_response += cleaned + "\n\n"
-        logger.debug("Raw outputs: %s", raw_outputs[:100])
+        logger.debug("Raw outputs for chunk: %s", raw_outputs[:100])
         logger.debug("Chunk response length: %d chars", len(chunk_response))
         return chunk_response

@@ -348,11 +357,13 @@ Patient Record Excerpt:
         with open(report_path, "w", encoding="utf-8") as f:
             f.write(summary)
         yield history, report_path if report_path and os.path.exists(report_path) else None, summary
+        logger.info("Analysis took %.2f seconds", time.time() - start_time)

     except Exception as e:
         logger.error("Analysis error: %s", e)
         history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
         yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
+        logger.info("Analysis took %.2f seconds", time.time() - start_time)

 send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
 msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
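A note on the first hunk's concurrency change: iterating the `futures` list in submission order (rather than `as_completed`) makes each `future.result()` block until that page finishes, so results already arrive in page order; the added `text_chunks.sort` keeps the ordering guarantee explicit even if the collection loop changes later. A runnable sketch of the pattern, with `slow_extract` as a hypothetical stand-in for `extract_page`:

```python
import random
import time
from concurrent.futures import ThreadPoolExecutor

def slow_extract(i: int) -> tuple:
    # Simulate uneven per-page work so completion order differs from page order.
    time.sleep(random.random() / 10)
    return (i, f"=== Page {i + 1} ===")

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(slow_extract, i) for i in range(8)]
    # Iterating the futures list (not as_completed) returns results in
    # submission order, regardless of which worker finished first.
    chunks = [f.result() for f in futures]

chunks.sort(key=lambda c: c[0])  # belt-and-braces, mirroring the diff
print("\n".join(text for _, text in chunks))
```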
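The per-page OCR fallback in the first hunk follows the standard pypdfium2 + pytesseract recipe: render the page to a PIL image at increased scale, then hand it to Tesseract. A minimal sketch of that recipe in isolation, assuming both libraries (and a Tesseract binary) are installed; the file path is hypothetical:

```python
import pypdfium2 as pdfium
import pytesseract

def ocr_page(file_path: str, page_index: int) -> str:
    """Render one PDF page to an image and OCR it."""
    pdf = pdfium.PdfDocument(file_path)
    try:
        # scale=2 roughly doubles the render DPI, which helps Tesseract.
        image = pdf[page_index].render(scale=2).to_pil()
        return pytesseract.image_to_string(image, lang="eng")
    finally:
        pdf.close()

print(ocr_page("scanned_record.pdf", 0))  # hypothetical path
```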