Ali2206 committed on
Commit
5eb9bf1
·
verified ·
1 Parent(s): c0b195c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -50
app.py CHANGED
@@ -16,10 +16,16 @@ import gc
16
  from diskcache import Cache
17
  import time
18
  import asyncio
19
- import pypdfium2 as pdfium
20
- import pytesseract
21
- from PIL import Image
22
- import io
 
 
 
 
 
 
23
 
24
  # Configure logging and suppress warnings
25
  logging.basicConfig(level=logging.INFO)
@@ -63,40 +69,72 @@ def file_hash(path: str) -> str:
63
 
64
  async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
65
  try:
66
- pdf = pdfium.PdfDocument(file_path)
67
- total_pages = len(pdf)
68
- if total_pages == 0:
69
- return ""
70
-
71
- batch_size = 5
72
- batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
73
- text_chunks = [""] * total_pages
74
- processed_pages = 0
75
-
76
- def extract_batch(start: int, end: int) -> List[tuple]:
77
- results = []
78
- for i in range(start, end):
79
- page = pdf[i]
80
- text = page.get_textpage().get_text_range() or ""
81
- if not text.strip() and use_ocr:
82
- # Fallback to OCR
83
- bitmap = page.render(scale=2).to_pil()
84
- text = pytesseract.image_to_string(bitmap, lang="eng")
85
- results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
86
- return results
87
-
88
- loop = asyncio.get_event_loop()
89
- with ThreadPoolExecutor(max_workers=4) as executor:
90
- futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
91
- for future in await asyncio.gather(*futures):
92
- for page_num, text in future:
93
- text_chunks[page_num] = text
94
- logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
95
- processed_pages += batch_size
96
- if progress_callback:
97
- progress_callback(min(processed_pages, total_pages), total_pages)
98
-
99
- pdf.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  extracted_text = "\n\n".join(filter(None, text_chunks))
101
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
102
  return extracted_text
@@ -113,7 +151,6 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
113
  return cache[cache_key]
114
 
115
  if file_type == "pdf":
116
- # Try without OCR first, fallback to OCR if empty
117
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
118
  if not text.strip() or "PDF processing error" in text:
119
  logger.info("Retrying extraction with OCR for %s", file_path)
@@ -158,7 +195,7 @@ def log_system_usage(tag=""):
158
 
159
  def clean_response(text: str) -> str:
160
  text = sanitize_utf8(text)
161
- text = text.replace("[", "").replace("]", "").replace("None", "") # Faster string ops
162
  text = text.replace("\n\n\n", "\n\n")
163
  sections = {}
164
  current_section = None
@@ -171,12 +208,12 @@ def clean_response(text: str) -> str:
171
  current_section = section_match.group(1)
172
  sections.setdefault(current_section, [])
173
  continue
174
- if current_section and line.startswith("- ") and "No issues identified" not in line:
175
  sections[current_section].append(line)
176
  cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
177
  result = "\n\n".join(cleaned).strip()
178
  logger.debug("Cleaned response length: %d chars", len(result))
179
- return result or ""
180
 
181
  def summarize_findings(combined_response: str) -> str:
182
  if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
@@ -265,8 +302,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
265
  yield history, None, ""
266
  logger.info("Extracted text length: %d chars", len(extracted))
267
 
268
- chunk_size = 4000 # Increased slightly
269
- chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
 
 
270
  logger.info("Created %d chunks", len(chunks))
271
  combined_response = ""
272
  batch_size = 2
@@ -282,7 +321,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
282
  async def process_chunk(prompt):
283
  chunk_response = ""
284
  for chunk_output in agent.run_gradio_chat(
285
- message=prompt, history=[], temperature=0.2, max_new_tokens=128, max_token=768, call_agent=False, conversation=[]
286
  ):
287
  if chunk_output is None:
288
  continue
@@ -290,12 +329,10 @@ Patient Record Excerpt (Chunk {0} of {1}):
290
  for m in chunk_output:
291
  if hasattr(m, 'content') and m.content:
292
  cleaned = clean_response(m.content)
293
- if cleaned and re.search(r"###\s*\w+", cleaned):
294
- chunk_response += cleaned + "\n\n"
295
  elif isinstance(chunk_output, str) and chunk_output.strip():
296
  cleaned = clean_response(chunk_output)
297
- if cleaned and re.search(r"###\s*\w+", cleaned):
298
- chunk_response += cleaned + "\n\n"
299
  logger.debug("Chunk response length: %d chars", len(chunk_response))
300
  return chunk_response
301
 
@@ -305,7 +342,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
305
  gc.collect()
306
 
307
  for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
308
- if chunk_response:
309
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
310
  else:
311
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
 
16
  from diskcache import Cache
17
  import time
18
  import asyncio
19
+
20
+ # Try importing pypdfium2 and pytesseract, fall back to pdfplumber
21
+ try:
22
+ import pypdfium2 as pdfium
23
+ import pytesseract
24
+ from PIL import Image
25
+ HAS_PYPDFIUM2 = True
26
+ except ImportError:
27
+ HAS_PYPDFIUM2 = False
28
+ import pdfplumber
29
 
30
  # Configure logging and suppress warnings
31
  logging.basicConfig(level=logging.INFO)
 
69
 
70
  async def extract_all_pages_async(file_path: str, progress_callback=None, use_ocr=False) -> str:
71
  try:
72
+ if HAS_PYPDFIUM2:
73
+ pdf = pdfium.PdfDocument(file_path)
74
+ total_pages = len(pdf)
75
+ if total_pages == 0:
76
+ return ""
77
+
78
+ batch_size = 5
79
+ batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
80
+ text_chunks = [""] * total_pages
81
+ processed_pages = 0
82
+
83
+ def extract_batch(start: int, end: int) -> List[tuple]:
84
+ results = []
85
+ for i in range(start, end):
86
+ page = pdf[i]
87
+ text = page.get_textpage().get_text_range() or ""
88
+ if not text.strip() and use_ocr and 'pytesseract' in sys.modules:
89
+ bitmap = page.render(scale=2).to_pil()
90
+ text = pytesseract.image_to_string(bitmap, lang="eng")
91
+ results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
92
+ return results
93
+
94
+ loop = asyncio.get_event_loop()
95
+ with ThreadPoolExecutor(max_workers=4) as executor:
96
+ futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
97
+ for future in await asyncio.gather(*futures):
98
+ for page_num, text in future:
99
+ text_chunks[page_num] = text
100
+ logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
101
+ processed_pages += batch_size
102
+ if progress_callback:
103
+ progress_callback(min(processed_pages, total_pages), total_pages)
104
+
105
+ pdf.close()
106
+ else:
107
+ # Fallback to pdfplumber
108
+ with pdfplumber.open(file_path) as pdf:
109
+ total_pages = len(pdf.pages)
110
+ if total_pages == 0:
111
+ return ""
112
+
113
+ batch_size = 5
114
+ batches = [(i, min(i + batch_size, total_pages)) for i in range(0, total_pages, batch_size)]
115
+ text_chunks = [""] * total_pages
116
+ processed_pages = 0
117
+
118
+ def extract_batch(start: int, end: int) -> List[tuple]:
119
+ results = []
120
+ with pdfplumber.open(file_path) as pdf:
121
+ for i in range(start, end):
122
+ page = pdf.pages[i]
123
+ text = page.extract_text() or ""
124
+ results.append((i, f"=== Page {i + 1} ===\n{text.strip()}"))
125
+ return results
126
+
127
+ loop = asyncio.get_event_loop()
128
+ with ThreadPoolExecutor(max_workers=4) as executor:
129
+ futures = [loop.run_in_executor(executor, extract_batch, start, end) for start, end in batches]
130
+ for future in await asyncio.gather(*futures):
131
+ for page_num, text in future:
132
+ text_chunks[page_num] = text
133
+ logger.debug("Page %d extracted: %s...", page_num + 1, text[:50])
134
+ processed_pages += batch_size
135
+ if progress_callback:
136
+ progress_callback(min(processed_pages, total_pages), total_pages)
137
+
138
  extracted_text = "\n\n".join(filter(None, text_chunks))
139
  logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
140
  return extracted_text
 
151
  return cache[cache_key]
152
 
153
  if file_type == "pdf":
 
154
  text = asyncio.run(extract_all_pages_async(file_path, progress_callback, use_ocr=False))
155
  if not text.strip() or "PDF processing error" in text:
156
  logger.info("Retrying extraction with OCR for %s", file_path)
 
195
 
196
  def clean_response(text: str) -> str:
197
  text = sanitize_utf8(text)
198
+ text = text.replace("[", "").replace("]", "").replace("None", "")
199
  text = text.replace("\n\n\n", "\n\n")
200
  sections = {}
201
  current_section = None
 
208
  current_section = section_match.group(1)
209
  sections.setdefault(current_section, [])
210
  continue
211
+ if current_section and line.startswith("- "):
212
  sections[current_section].append(line)
213
  cleaned = [f"### {heading}\n" + "\n".join(findings) for heading, findings in sections.items() if findings]
214
  result = "\n\n".join(cleaned).strip()
215
  logger.debug("Cleaned response length: %d chars", len(result))
216
+ return result or "No issues identified"
217
 
218
  def summarize_findings(combined_response: str) -> str:
219
  if not combined_response or all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
 
302
  yield history, None, ""
303
  logger.info("Extracted text length: %d chars", len(extracted))
304
 
305
+ chunk_size = 3000 # Adjusted for balance
306
+ chunks = [extracted[i:i + chunk_size] for i in range(0, max(len(extracted), 1), chunk_size)]
307
+ if not chunks:
308
+ chunks = [""] # Ensure at least one chunk
309
  logger.info("Created %d chunks", len(chunks))
310
  combined_response = ""
311
  batch_size = 2
 
321
  async def process_chunk(prompt):
322
  chunk_response = ""
323
  for chunk_output in agent.run_gradio_chat(
324
+ message=prompt, history=[], temperature=0.2, max_new_tokens=256, max_token=1024, call_agent=False, conversation=[]
325
  ):
326
  if chunk_output is None:
327
  continue
 
329
  for m in chunk_output:
330
  if hasattr(m, 'content') and m.content:
331
  cleaned = clean_response(m.content)
332
+ chunk_response += cleaned + "\n\n"
 
333
  elif isinstance(chunk_output, str) and chunk_output.strip():
334
  cleaned = clean_response(chunk_output)
335
+ chunk_response += cleaned + "\n\n"
 
336
  logger.debug("Chunk response length: %d chars", len(chunk_response))
337
  return chunk_response
338
 
 
342
  gc.collect()
343
 
344
  for chunk_idx, chunk_response in enumerate(batch_responses, batch_idx + 1):
345
+ if chunk_response.strip():
346
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
347
  else:
348
  combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"