Update app.py
app.py
CHANGED
@@ -81,35 +81,45 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_

    def extract_page(i):
        page = pdf.pages[i]
-       # Try table extraction first
        text = ""
-       tables = page.extract_tables()
+       # Adjust table settings for complex layouts
+       table_settings = {
+           "vertical_strategy": "lines",
+           "horizontal_strategy": "lines",
+           "explicit_vertical_lines": [],
+           "explicit_horizontal_lines": [],
+           "snap_tolerance": 5,
+           "join_tolerance": 5,
+           "edge_min_length": 3,
+           "min_words_vertical": 3,
+           "min_words_horizontal": 1,
+           "intersection_tolerance": 5,
+       }
+       tables = page.extract_tables(table_settings=table_settings)
        if tables:
            for table in tables:
-               # Mimic Excel/CSV: join non-None cells as strings
                table_text = "\n".join(
                    " | ".join(str(cell) if cell is not None else "" for cell in row)
-                   for row in table
+                   for row in table if any(cell is not None for cell in row)
                )
                text += table_text + "\n\n"
            logger.debug("Page %d extracted %d tables, text length: %d chars", i + 1, len(tables), len(text))
        else:
-           # Fall back to raw text
            text = page.extract_text() or ""
            logger.debug("Page %d no tables, raw text length: %d chars", i + 1, len(text))

-       # OCR if text is short or force_ocr is True
+       # Force OCR if text is short or force_ocr is True
        if (not text.strip() or len(text) < 100 or force_ocr) and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
            try:
                logger.info("Attempting OCR for page %d", i + 1)
                pdfium_pdf = pdfium.PdfDocument(file_path)
                page_bitmap = pdfium_pdf[i].render(scale=2).to_pil()
-               text = pytesseract.image_to_string(page_bitmap, lang="eng")
-               logger.debug("Page %d OCR text length: %d chars", i + 1, len(text))
+               ocr_text = pytesseract.image_to_string(page_bitmap, lang="eng")
+               logger.debug("Page %d OCR text length: %d chars", i + 1, len(ocr_text))
+               text = ocr_text if ocr_text.strip() else text
                pdfium_pdf.close()
            except Exception as e:
                logger.error("OCR failed for page %d: %s", i + 1, e)
-               text = text or ""
        return (i, f"=== Page {i + 1} ===\n{text.strip()}")

    with ThreadPoolExecutor(max_workers=4) as executor:

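The hunk above switches table extraction to explicit line-based settings and adds a pytesseract fallback when the text layer is thin. For trying the same pattern outside the app, here is a minimal standalone sketch (not the committed code); it assumes pdfplumber, pypdfium2, and pytesseract are installed, and "sample.pdf" is a placeholder path:

# Minimal sketch: table-first extraction with an OCR fallback for one page.
# Assumes pdfplumber, pypdfium2, and pytesseract are installed; "sample.pdf" is a placeholder.
import pdfplumber
import pypdfium2 as pdfium
import pytesseract

TABLE_SETTINGS = {"vertical_strategy": "lines", "horizontal_strategy": "lines", "snap_tolerance": 5}

def extract_one_page(path: str, index: int) -> str:
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[index]
        tables = page.extract_tables(table_settings=TABLE_SETTINGS)
        if tables:
            # Join cells with " | ", one table row per line, blank line between tables
            text = "\n\n".join(
                "\n".join(" | ".join("" if c is None else str(c) for c in row) for row in t)
                for t in tables
            )
        else:
            text = page.extract_text() or ""
    if len(text.strip()) < 100:  # fall back to OCR when the text layer is thin
        doc = pdfium.PdfDocument(path)
        try:
            image = doc[index].render(scale=2).to_pil()
            ocr = pytesseract.image_to_string(image, lang="eng")
            text = ocr if ocr.strip() else text
        finally:
            doc.close()
    return text

print(extract_one_page("sample.pdf", 0))
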
@@ -124,6 +134,7 @@ async def extract_all_pages_async(file_path: str, progress_callback=None, force_
    text_chunks.sort(key=lambda x: x[0])
    extracted_text = "\n\n".join(chunk[1] for chunk in text_chunks if chunk[1].strip())
    logger.info("Extracted %d pages, total length: %d chars", total_pages, len(extracted_text))
+   # Force OCR retry if text is too short
    if len(extracted_text) < 1000 and not force_ocr and HAS_PYPDFIUM2 and 'pytesseract' in sys.modules:
        logger.info("Text too short, forcing OCR for all pages")
        return await extract_all_pages_async(file_path, progress_callback, force_ocr=True)

@@ -136,25 +147,54 @@ def convert_file_to_json(file_path: str, file_type: str, progress_callback=None)
    try:
        file_h = file_hash(file_path)
        cache_key = f"{file_h}_{file_type}"
-       if cache_key in cache:
-           logger.info("Using cached extraction for %s", file_path)
-           return cache[cache_key]
+       # Bypass cache to force fresh extraction
+       logger.info("Forcing fresh extraction for %s", file_path)
+       # if cache_key in cache:
+       #     logger.info("Using cached extraction for %s", file_path)
+       #     return cache[cache_key]

        if file_type == "pdf":
            text = asyncio.run(extract_all_pages_async(file_path, progress_callback, force_ocr=False))
            result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
        elif file_type == "csv":
-
-
-
-
+           try:
+               df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
+                                skip_blank_lines=False, on_bad_lines="skip")
+               content = df.fillna("").astype(str).values.tolist()
+               result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+               logger.info("CSV processed, rows: %d", len(content))
+           except Exception as e:
+               logger.error("CSV processing failed: %s", e)
+               result = json.dumps({"error": f"CSV processing failed: {str(e)}"})
        elif file_type in ["xls", "xlsx"]:
            try:
-
-
-
-
-
+               # Try all sheets to maximize data
+               xl = pd.ExcelFile(file_path, engine="openpyxl")
+               content = []
+               for sheet_name in xl.sheet_names:
+                   try:
+                       df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
+                       sheet_content = df.fillna("").astype(str).values.tolist()
+                       content.extend(sheet_content)
+                       logger.debug("Excel sheet %s processed, rows: %d", sheet_name, len(sheet_content))
+                   except Exception as e:
+                       logger.warning("Excel sheet %s failed: %s", sheet_name, e)
+               if not content:
+                   logger.error("No valid data extracted from Excel")
+                   result = json.dumps({"error": "No valid data extracted from Excel"})
+               else:
+                   result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+                   logger.info("Excel processed, total rows: %d", len(content))
+           except Exception as e:
+               logger.error("Excel processing failed: %s", e)
+               try:
+                   df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
+                   content = df.fillna("").astype(str).values.tolist()
+                   result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+                   logger.info("Excel processed with xlrd, rows: %d", len(content))
+               except Exception as e2:
+                   logger.error("Excel processing failed with xlrd: %s", e2)
+                   result = json.dumps({"error": f"Excel processing failed: {str(e)}"})
        else:
            result = json.dumps({"error": f"Unsupported file type: {file_type}"})

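The spreadsheet branch above reads every sheet with openpyxl and falls back to xlrd on failure. A minimal standalone sketch of that pattern, assuming pandas with openpyxl (and xlrd for legacy .xls files) is installed and "data.xlsx" is a placeholder filename:

# Minimal sketch: read every sheet of a workbook as strings, with an xlrd fallback.
# Assumes pandas + openpyxl (and xlrd for legacy .xls); "data.xlsx" is a placeholder.
import json
import pandas as pd

def workbook_to_rows(path: str) -> str:
    rows = []
    try:
        xl = pd.ExcelFile(path, engine="openpyxl")
        for sheet_name in xl.sheet_names:
            df = pd.read_excel(path, sheet_name=sheet_name, engine="openpyxl", header=None, dtype=str)
            rows.extend(df.fillna("").astype(str).values.tolist())
    except Exception:
        # openpyxl cannot read old binary .xls files; retry with xlrd
        df = pd.read_excel(path, engine="xlrd", header=None, dtype=str)
        rows = df.fillna("").astype(str).values.tolist()
    if not rows:
        return json.dumps({"error": "No valid data extracted from Excel"})
    return json.dumps({"filename": path, "rows": rows})

print(workbook_to_rows("data.xlsx"))
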
@@ -324,7 +364,7 @@ Patient Record Excerpt:
        chunk_response = ""
        raw_outputs = []
        for chunk_output in agent.run_gradio_chat(
-           message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=
+           message=prompt, history=[], temperature=0.2, max_new_tokens=512, max_token=2048, call_agent=False, conversation=[]
        ):
            if chunk_output is None:
                continue