prithivMLmods committed on
Commit
ac626f9
·
verified ·
1 Parent(s): 39360bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -103
app.py CHANGED
@@ -10,6 +10,9 @@ from io import BytesIO
10
  from PIL import Image
11
  from gradio_pdf import PDF
12
  from loguru import logger
 
 
 
13
  from datetime import datetime
14
  from pathlib import Path
15
  import torch
@@ -24,17 +27,6 @@ import tempfile
24
  pdf_suffixes = [".pdf"]
25
  image_suffixes = [".png", ".jpeg", ".jpg"]
26
 
27
- # LaTeX delimiter configurations
28
- latex_delimiters_type_a = [
29
- {'left': '$$', 'right': '$$', 'display': True},
30
- {'left': '$', 'right': '$', 'display': False},
31
- ]
32
- latex_delimiters_type_b = [
33
- {'left': '\\(', 'right': '\\)', 'display': False},
34
- {'left': '\\[', 'right': '\\]', 'display': True},
35
- ]
36
- latex_delimiters_type_all = latex_delimiters_type_a + latex_delimiters_type_b
37
-
38
  # --- Model and Processor Initialization ---
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
@@ -131,7 +123,9 @@ def to_pdf(file_path: str) -> str or None:
131
 
132
  pdf_bytes = read_fn(file_path)
133
  unique_filename = f'{safe_stem(file_path)}.pdf'
134
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
 
 
135
 
136
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
137
  tmp_pdf_file.write(pdf_bytes)
@@ -139,76 +133,86 @@ def to_pdf(file_path: str) -> str or None:
139
  return tmp_file_path
140
 
141
 
142
- def convert_pdf_to_images_fitz(pdf_path: str, dpi: int = 200) -> list:
143
- """
144
- Converts a PDF file to a list of PIL Images using PyMuPDF (fitz).
145
- This function replaces the need for the external Poppler dependency.
146
- """
147
- images = []
148
- logger.info(f"Converting PDF '{pdf_path}' to images with PyMuPDF.")
149
- try:
150
- pdf_document = fitz.open(pdf_path)
151
- # Calculate zoom factor based on desired DPI
152
- zoom = dpi / 72.0
153
- mat = fitz.Matrix(zoom, zoom)
154
-
155
- for page_num in range(len(pdf_document)):
156
- page = pdf_document.load_page(page_num)
157
- pix = page.get_pixmap(matrix=mat)
158
- img_data = pix.tobytes("png") # Use PNG for better quality
159
- image = Image.open(BytesIO(img_data))
160
- images.append(image)
161
- pdf_document.close()
162
- logger.info(f"Successfully converted {len(images)} pages.")
163
- except Exception as e:
164
- logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
165
- raise
166
- return images
167
-
168
-
169
  async def pdf_parse(file_path: str, request: gr.Request):
170
  """
171
  Main parsing function that orchestrates the PDF processing pipeline.
 
172
  """
173
  if file_path is None:
174
  logger.warning("file_path is None")
175
  return (
176
- "<p>Please upload a PDF file</p>",
177
- "",
178
- "<p>No input file</p>",
179
- None,
180
- None,
181
- "Error: No file provided"
182
  )
183
  logger.info(f'Processing file: {file_path}')
184
 
185
- # Ensure file is in PDF format
186
  tmp_pdf_path = to_pdf(file_path)
187
  if tmp_pdf_path is None:
188
  return (
189
- "<p>Failed to process file</p>",
190
- "",
191
- "<p>Processing error</p>",
192
- None,
193
- None,
194
- "Error: Failed to process file"
195
  )
196
 
197
  start_time = time.time()
198
  try:
199
- # ** FIX: Use PyMuPDF (fitz) instead of pdf2image to avoid Poppler dependency **
200
- pages = convert_pdf_to_images_fitz(tmp_pdf_path, dpi=200)
201
-
202
  html_parts = []
203
- for i, page in enumerate(pages):
204
- logger.info(f"Parsing page {i+1}/{len(pages)}")
205
- html = parse_page(page)
206
- html_parts.append(f'<div class="page-{i+1}">{html}</div>')
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  full_html = '\n'.join(html_parts)
209
  parsing_time = time.time() - start_time
210
 
211
- # Convert generated HTML to Markdown
212
  mmd = html2text.html2text(full_html)
213
  mmd_html = markdown.markdown(mmd)
214
  qwen_html = full_html
@@ -218,20 +222,17 @@ async def pdf_parse(file_path: str, request: gr.Request):
218
  f.write(mmd)
219
  md_path = f.name
220
 
221
- input_path = tmp_pdf_path
222
  cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
223
 
224
- return mmd_html, mmd, qwen_html, md_path, input_path, cost_time
225
 
226
  except Exception as e:
227
  logger.error(f"Parsing failed: {e}")
 
 
228
  return (
229
- "<p>Parsing failed. Please try again.</p>",
230
- "",
231
- f"<p>Error: {str(e)}</p>",
232
- None,
233
- None,
234
- f"Error: {str(e)}"
235
  )
236
 
237
 
@@ -241,36 +242,13 @@ def main(ctx, **kwargs):
241
  """
242
  Sets up and launches the Gradio user interface.
243
  """
244
- with gr.Blocks(head='''
245
- <meta name="data-spm" content="label" />
246
- <meta name="aplus-core" content="aplus.js" />
247
- <meta name="aplus-ifr-pv" content="1"/>
248
- <meta name="aplus-iframe-ignore-i" content="on" />
249
- <script>
250
- window.APLUS_CONFIG = {
251
- pid: 'aidata',
252
- };
253
- (function (w, d, s, q) {
254
- w[q] = w[q] || [];
255
- var f = d.getElementsByTagName(s)[0],
256
- j = d.createElement(s);
257
- j.async = true;
258
- j.id = 'beacon-aplus';
259
- var userIdParam = '';
260
- j.setAttribute(
261
- 'exparams',
262
- 'userid=' +
263
- userIdParam +
264
- '&aplus&sidx=aplusSidex&ckx=aplusCkx'
265
- );
266
- j.src = '//g.alicdn.com/alilog/mlog/aplus_v2.js';
267
- j.crossorigin = 'anonymous';
268
- f.parentNode.insertBefore(j, f);
269
- })(window, document, 'script', 'aplus_queue');
270
- </script>
271
- ''') as demo:
272
  gr.Markdown("# 📄 Logics-Parsing Document Analysis")
273
- gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats.")
274
  with gr.Row():
275
  with gr.Column(variant='panel', scale=5):
276
  with gr.Row():
@@ -287,10 +265,7 @@ def main(ctx, **kwargs):
287
  example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
288
  if example_files:
289
  with gr.Accordion('Examples:', open=True):
290
- gr.Examples(
291
- examples=example_files,
292
- inputs=input_file
293
- )
294
 
295
  with gr.Column(variant='panel', scale=5):
296
  output_file = gr.File(label='Download Markdown Result', interactive=False)
@@ -303,9 +278,7 @@ def main(ctx, **kwargs):
303
  with gr.Tab('Generated HTML'):
304
  raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")
305
 
306
- # Define component list for clearing
307
  components_to_clear = [input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time]
308
-
309
  clear_bu.add(components_to_clear)
310
 
311
  input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")
 
10
  from PIL import Image
11
  from gradio_pdf import PDF
12
  from loguru import logger
13
+ import sys # Added for logging configuration
14
+ import base64 # Added for image encoding
15
+ from bs4 import BeautifulSoup # Added for HTML manipulation
16
  from datetime import datetime
17
  from pathlib import Path
18
  import torch
 
27
  pdf_suffixes = [".pdf"]
28
  image_suffixes = [".png", ".jpeg", ".jpg"]
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  # --- Model and Processor Initialization ---
31
  device = "cuda" if torch.cuda.is_available() else "cpu"
32
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
 
123
 
124
  pdf_bytes = read_fn(file_path)
125
  unique_filename = f'{safe_stem(file_path)}.pdf'
126
+ # Use Gradio's temp directory for temporary files
127
+ tmp_dir = tempfile.gettempdir()
128
+ tmp_file_path = os.path.join(tmp_dir, unique_filename)
129
 
130
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
131
  tmp_pdf_file.write(pdf_bytes)
 
133
  return tmp_file_path
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  async def pdf_parse(file_path: str, request: gr.Request):
137
  """
138
  Main parsing function that orchestrates the PDF processing pipeline.
139
+ It now extracts images directly and injects them into the final HTML.
140
  """
141
  if file_path is None:
142
  logger.warning("file_path is None")
143
  return (
144
+ "<p>Please upload a PDF file</p>", "", "<p>No input file</p>",
145
+ None, None, "Error: No file provided"
 
 
 
 
146
  )
147
  logger.info(f'Processing file: {file_path}')
148
 
 
149
  tmp_pdf_path = to_pdf(file_path)
150
  if tmp_pdf_path is None:
151
  return (
152
+ "<p>Failed to process file</p>", "", "<p>Processing error</p>",
153
+ None, None, "Error: Failed to process file"
 
 
 
 
154
  )
155
 
156
  start_time = time.time()
157
  try:
158
+ pdf_document = fitz.open(tmp_pdf_path)
 
 
159
  html_parts = []
 
 
 
 
160
 
161
+ # Process each page
162
+ for page_num in range(len(pdf_document)):
163
+ page = pdf_document.load_page(page_num)
164
+ logger.info(f"Processing Page {page_num + 1}/{len(pdf_document)}")
165
+
166
+ # --- 1. Extract images directly from the PDF page using PyMuPDF ---
167
+ page_images_base64 = []
168
+ img_list = page.get_images(full=True)
169
+ for img_index, img in enumerate(img_list):
170
+ xref = img[0]
171
+ base_image = pdf_document.extract_image(xref)
172
+ image_bytes = base_image["image"]
173
+ image_ext = base_image["ext"]
174
+ base64_string = f"data:image/{image_ext};base64,{base64.b64encode(image_bytes).decode()}"
175
+ page_images_base64.append(base64_string)
176
+
177
+ logger.info(f" > Found {len(page_images_base64)} images on page {page_num + 1}.")
178
+
179
+ # --- 2. Render the page to an image for the VL-Model ---
180
+ zoom = 200 / 72.0 # Corresponds to 200 DPI
181
+ mat = fitz.Matrix(zoom, zoom)
182
+ pix = page.get_pixmap(matrix=mat)
183
+ page_image = Image.open(BytesIO(pix.tobytes("png")))
184
+
185
+ # --- 3. Get the structured HTML from the model ---
186
+ logger.info(f" > Parsing page layout with Logics-Parsing model...")
187
+ html_content = parse_page(page_image)
188
+
189
+ # --- 4. Inject extracted images back into the HTML ---
190
+ if page_images_base64:
191
+ logger.info(f" > Injecting {len(page_images_base64)} extracted images into generated HTML...")
192
+ soup = BeautifulSoup(html_content, 'html.parser')
193
+ figures = soup.find_all('figure')
194
+
195
+ # If model identified same number of figures, inject images into them
196
+ if len(figures) == len(page_images_base64):
197
+ for fig, b64_img in zip(figures, page_images_base64):
198
+ img_tag = soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;")
199
+ fig.append(img_tag)
200
+ else: # Otherwise, append all images at the end of the page content as a fallback
201
+ logger.warning(f" > Mismatch: Model found {len(figures)} figures, but {len(page_images_base64)} images were extracted. Appending images to the end.")
202
+ for b64_img in page_images_base64:
203
+ img_tag = soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;")
204
+ p_tag = soup.new_tag('p')
205
+ p_tag.append(img_tag)
206
+ soup.append(p_tag)
207
+ html_content = str(soup)
208
+
209
+ html_parts.append(f'<div class="page-{page_num+1}">{html_content}</div>')
210
+
211
+ pdf_document.close()
212
  full_html = '\n'.join(html_parts)
213
  parsing_time = time.time() - start_time
214
 
215
+ # Convert final rich HTML to Markdown
216
  mmd = html2text.html2text(full_html)
217
  mmd_html = markdown.markdown(mmd)
218
  qwen_html = full_html
 
222
  f.write(mmd)
223
  md_path = f.name
224
 
 
225
  cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
226
 
227
+ return mmd_html, mmd, qwen_html, md_path, tmp_pdf_path, cost_time
228
 
229
  except Exception as e:
230
  logger.error(f"Parsing failed: {e}")
231
+ import traceback
232
+ traceback.print_exc()
233
  return (
234
+ "<p>Parsing failed. Please try again.</p>", "", f"<p>Error: {str(e)}</p>",
235
+ None, None, f"Error: {str(e)}"
 
 
 
 
236
  )
237
 
238
 
 
242
  """
243
  Sets up and launches the Gradio user interface.
244
  """
245
+ # **FIX: Configure Loguru for better visibility in deployment environments**
246
+ logger.remove() # Remove default handler
247
+ logger.add(sys.stdout, level="INFO")
248
+
249
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  gr.Markdown("# 📄 Logics-Parsing Document Analysis")
251
+ gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats, now with improved image extraction.")
252
  with gr.Row():
253
  with gr.Column(variant='panel', scale=5):
254
  with gr.Row():
 
265
  example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
266
  if example_files:
267
  with gr.Accordion('Examples:', open=True):
268
+ gr.Examples(examples=example_files, inputs=input_file)
 
 
 
269
 
270
  with gr.Column(variant='panel', scale=5):
271
  output_file = gr.File(label='Download Markdown Result', interactive=False)
 
278
  with gr.Tab('Generated HTML'):
279
  raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")
280
 
 
281
  components_to_clear = [input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time]
 
282
  clear_bu.add(components_to_clear)
283
 
284
  input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")