Ali2206 committed on
Commit
8d797c3
·
verified ·
1 Parent(s): d37093e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -31
app.py CHANGED
@@ -73,51 +73,59 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
73
  total_pages = 0
74
  text_chunks = []
75
 
76
- if HAS_PYPDFIUM2:
77
- pdf = pdfium.PdfDocument(file_path)
78
- total_pages = len(pdf)
79
  if total_pages == 0:
 
80
  return ""
81
 
82
  def extract_page(i):
83
- page = pdf[i]
84
- text = page.get_textpage().get_text_range() or ""
85
- if (not text.strip() or len(text) < 100) and force_ocr and 'pytesseract' in sys.modules:
86
- logger.info("Falling back to OCR for page %d", i + 1)
87
- bitmap = page.render(scale=2).to_pil()
88
- text = pytesseract.image_to_string(bitmap, lang="eng")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
90
 
91
  with ThreadPoolExecutor(max_workers=4) as executor:
92
  futures = [executor.submit(extract_page, i) for i in range(total_pages)]
93
- for future in as_completed(futures):
94
  page_num, text = future.result()
95
  text_chunks.append((page_num, text))
96
  logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
97
  if progress_callback:
98
  progress_callback(page_num + 1, total_pages)
99
 
100
- text_chunks.sort(key=lambda x: x[0])
101
- extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
102
- pdf.close()
103
- else:
104
- with pdfplumber.open(file_path) as pdf:
105
- total_pages = len(pdf.pages)
106
- if total_pages == 0:
107
- return ""
108
-
109
- for i, page in enumerate(pdf.pages):
110
- text = page.extract_text() or ""
111
- text_chunks.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
112
- logger.debug("Page %d extracted: %s...", i + 1, text[:50])
113
- if progress_callback:
114
- progress_callback(i + 1, total_pages)
115
-
116
- extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
117
-
118
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
119
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
120
- logger.info("Text too short, retrying with OCR")
121
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
122
  return extracted_text
123
  except Exception as e:
@@ -276,6 +284,7 @@ Patient Record Excerpt:
276
  """
277
 
278
  async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
 
279
  history.append({"role": "user", "content": message})
280
  yield history, None, ""
281
 
@@ -288,7 +297,7 @@ Patient Record Excerpt:
288
 
289
  futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
290
  results = [sanitize_utf8(future) for future in futures]
291
- extracted = "\n".join(results)
292
  file_hash_value = file_hash(files[0].name) if files else ""
293
 
294
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
@@ -329,7 +338,7 @@ Patient Record Excerpt:
329
  raw_outputs.append(chunk_output)
330
  cleaned = clean_response(chunk_output)
331
  chunk_response += cleaned + "\n\n"
332
- logger.debug("Raw outputs: %s", raw_outputs[:100])
333
  logger.debug("Chunk response length: %d chars", len(chunk_response))
334
  return chunk_response
335
 
@@ -348,11 +357,13 @@ Patient Record Excerpt:
348
  with open(report_path, "w", encoding="utf-8") as f:
349
  f.write(summary)
350
  yield history, report_path if report_path and os.path.exists(report_path) else None, summary
 
351
 
352
  except Exception as e:
353
  logger.error("Analysis error: %s", e)
354
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
355
  yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
 
356
 
357
  send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
358
  msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
 
73
  total_pages = 0
74
  text_chunks = []
75
 
76
+ with pdfplumber.open(file_path) as pdf:
77
+ total_pages = len(pdf.pages)
 
78
  if total_pages == 0:
79
+ logger.error("No pages found in PDF")
80
  return ""
81
 
82
  def extract_page(i):
83
+ page = pdf.pages[i]
84
+ # Try table extraction first
85
+ text = ""
86
+ tables = page.extract_tables()
87
+ if tables:
88
+ for table in tables:
89
+ # Mimic Excel/CSV: join non-None cells as strings
90
+ table_text = "\n".join(
91
+ " | ".join(str(cell) if cell is not None else "" for cell in row)
92
+ for row in table
93
+ )
94
+ text += table_text + "\n\n"
95
+ logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
96
+ else:
97
+ # Fall back to raw text
98
+ text = page.extract_text() or ""
99
+ logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))
100
+
101
+ # OCR if text is short or force_ocr is True
102
+ if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
103
+ try:
104
+ logger.info("Attempting OCR for page %d", i + 1)
105
+ pdfium_pdf = pdfium.PdfDocument(file_path)
106
+ page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
107
+ text = pytesseract.image_to_string(page_bitmap, lang="eng")
108
+ logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
109
+ pdfium_pdf.close()
110
+ except Exception as e:
111
+ logger.error("OCR failed for page %d: %s", i + 1, e)
112
+ text = text or ""
113
  return (i, f"=== Page {i + 1} ===\n{text.strip()}")
114
 
115
  with ThreadPoolExecutor(max_workers=4) as executor:
116
  futures = [executor.submit(extract_page, i) for i in range(total_pages)]
117
+ for future in futures:
118
  page_num, text = future.result()
119
  text_chunks.append((page_num, text))
120
  logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
121
  if progress_callback:
122
  progress_callback(page_num + 1, total_pages)
123
 
124
+ text_chunks.sort(key=lambda x: x[0])
125
+ extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
127
  if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
128
+ logger.info("Text too short, forcing OCR for all pages")
129
  return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)
130
  return extracted_text
131
  except Exception as e:
 
284
  """
285
 
286
  async def analyze(message: str, history: List[dict], files: List, progress=gr.Progress()):
287
+ start_time = time.time()
288
  history.append({"role": "user", "content": message})
289
  yield history, None, ""
290
 
 
297
 
298
  futures = [convert_file_to_json(f.name, f.name.split(".")[-1].lower(), update_extraction_progress) for f in files]
299
  results = [sanitize_utf8(future) for future in futures]
300
+ extracted = "\n".join([json.loads(r).get("content", "") for r in results if "content" in json.loads(r)])
301
  file_hash_value = file_hash(files[0].name) if files else ""
302
 
303
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
 
338
  raw_outputs.append(chunk_output)
339
  cleaned = clean_response(chunk_output)
340
  chunk_response += cleaned + "\n\n"
341
+ logger.debug("Raw outputs for chunk: %s", raw_outputs[:100])
342
  logger.debug("Chunk response length: %d chars", len(chunk_response))
343
  return chunk_response
344
 
 
357
  with open(report_path, "w", encoding="utf-8") as f:
358
  f.write(summary)
359
  yield history, report_path if report_path and os.path.exists(report_path) else None, summary
360
+ logger.info("Analysis took %.2f seconds", time.time() - start_time)
361
 
362
  except Exception as e:
363
  logger.error("Analysis error: %s", e)
364
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
365
  yield history, None, f"### Comprehensive Clinical Oversight Summary\nError occurred during analysis: {str(e)}"
366
+ logger.info("Analysis took %.2f seconds", time.time() - start_time)
367
 
368
  send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])
369
  msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output, final_summary])