Ali2206 committed on
Commit
8d797c3
·
verified ·
1 Parent(s): d37093e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -31
app.py CHANGED
@@ -73,51 +73,59 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
73
  total_pages = 0
74
  text_chunks = []
75
 
76
- if HAS_PYPDFIUM2:
77
- pdf = pdfium.PdfDocument(file_path)
78
- total_pages = len(pdf)
79
  if total_pages == 0:
 
80
  return ""
81
 
82
  def extract_page(i):
83
- page = pdf[i]
84
- text = page.get_textpage().get_text_range() or ""
85
- if (not text.strip() or len(text) < 100) and force_ocr and 'pytesseract' in sys.modules:
86
- logger.info("Falling back to OCR for page %d", i + 1)
87
- bitmap = page.render(scale=2).to_pil()
88
- text = pytesseract.image_to_string(bitmap, lang="eng")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
90
 
91
  with ThreadPoolExecutor(max_workers=4) as executor:
92
  futures = [executor.submit(extract_page, i) for i in range(total_pages)]
93
- for future in as_completed(futures):
94
  page_num, text = future.result()
95
  text_chunks.append((page_num, text))
96
  logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
97
  if progress_callback:
98
  progress_callback(page_num + 1, total_pages)
99
 
100
- text_chunks.sort(key=lambda x: x[0])
101
- extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
102
- pdf.close()
103
- else:
104
- with pdfplumber.open(file_path) as pdf:
105
- total_pages = len(pdf.pages)
106
- if total_pages == 0:
107
- return ""
108
-
109
- for i, page in enumerate(pdf.pages):
110
- text = page.extract_text() or ""
111
- text_chunks.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
112
- logger.debug("Page %d extracted: %s...", i + 1, text[:50])
113
- if progress_callback:
114
- progress_callback(i + 1, total_pages)
115
-
116
- extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
117
-
118
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
119
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
120
- logger.info("Text too short, retrying with OCR")
121
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
122
  return extracted_text
123
  except Exception as e:
@@ -276,6 +284,7 @@ Patient Record Excerpt:
276
  """
277
 
278
  async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
 
279
  history.append({"role": "user", "content": message})
280
  yield history, None, ""
281
 
@@ -288,7 +297,7 @@ Patient Record Excerpt:
288
 
289
  futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
290
  results = [sanitize_utf8(future) for future in futures]
291
- extracted = "\n".join(results)
292
  file_hash_value = file_hash(files[0].name) if files else ""
293
 
294
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
@@ -329,7 +338,7 @@ Patient Record Excerpt:
329
  raw_outputs.append(chunk_output)
330
  cleaned = clean_response(chunk_output)
331
  chunk_response += cleaned + "\n\n"
332
- logger.debug("Raw outputs: %s", raw_outputs[:100])
333
  logger.debug("Chunk response length: %d chars", len(chunk_response))
334
  return chunk_response
335
 
@@ -348,11 +357,13 @@ Patient Record Excerpt:
348
  with open(report_path, "w", encoding="utf-8") as f:
349
  f.write(summary)
350
  yield history, report_path if report_path and os.path.exists(report_path) else None, summary
 
351
 
352
  except Exception as e:
353
  logger.error("Analysis error: %s", e)
354
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
355
  yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
 
356
 
357
  send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
358
  msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
 
73
  total_pages = 0
74
  text_chunks = []
75
 
76
+ with pdfplumber.open(file_path) as pdf:
77
+ total_pages = len(pdf.pages)
 
78
  if total_pages == 0:
79
+ logger.error("No pages found in PDF")
80
  return ""
81
 
82
  def extract_page(i):
83
+ page = pdf.pages[i]
84
+ # Try table extraction first
85
+ text = ""
86
+ tables = page.extract_tables()
87
+ if tables:
88
+ for table in tables:
89
+ # Mimic Excel/CSV: join non-None cells as strings
90
+ table_text = "\n".join(
91
+ " | ".join(str(cell) if cell is not None else "" for cell in row)
92
+ for row in table
93
+ )
94
+ text += table_text + "\n\n"
95
+ logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
96
+ else:
97
+ # Fall back to raw text
98
+ text = page.extract_text() or ""
99
+ logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
100
+
101
+ # OCR if text is short or force_ocr is True
102
+ if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
103
+ try:
104
+ logger.info("Attempting OCR for page %d", i + 1)
105
+ pdfium_pdf = pdfium.PdfDocument(file_path)
106
+ page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
107
+ text = pytesseract.image_to_string(page_bitmap, lang="eng")
108
+ logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
109
+ pdfium_pdf.close()
110
+ except Exception as e:
111
+ logger.error("OCR failed for page %d: %s", i + 1, e)
112
+ text = text or ""
113
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
114
 
115
  with ThreadPoolExecutor(max_workers=4) as executor:
116
  futures = [executor.submit(extract_page, i) for i in range(total_pages)]
117
+ for future in futures:
118
  page_num, text = future.result()
119
  text_chunks.append((page_num, text))
120
  logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
121
  if progress_callback:
122
  progress_callback(page_num + 1, total_pages)
123
 
124
+ text_chunks.sort(key=lambda x: x[0])
125
+ extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
127
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
128
+ logger.info("Text too short, forcing OCR for all pages")
129
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
130
  return extracted_text
131
  except Exception as e:
 
284
  """
285
 
286
  async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
287
+ start_time = time.time()
288
  history.append({"role": "user", "content": message})
289
  yield history, None, ""
290
 
 
297
 
298
  futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
299
  results = [sanitize_utf8(future) for future in futures]
300
+ extracted = "\n".join([json.loads(r).get("content", "") for r in results if "content" in json.loads(r)])
301
  file_hash_value = file_hash(files[0].name) if files else ""
302
 
303
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
 
338
  raw_outputs.append(chunk_output)
339
  cleaned = clean_response(chunk_output)
340
  chunk_response += cleaned + "\n\n"
341
+ logger.debug("Raw outputs for chunk: %s", raw_outputs[:100])
342
  logger.debug("Chunk response length: %d chars", len(chunk_response))
343
  return chunk_response
344
 
 
357
  with open(report_path, "w", encoding="utf-8") as f:
358
  f.write(summary)
359
  yield history, report_path if report_path and os.path.exists(report_path) else None, summary
360
+ logger.info("Analysis took %.2f seconds", time.time() - start_time)
361
 
362
  except Exception as e:
363
  logger.error("Analysis error: %s", e)
364
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
365
  yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
366
+ logger.info("Analysis took %.2f seconds", time.time() - start_time)
367
 
368
  send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
369
  msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])