Spaces:

prithivMLmods
/

Tiny-VLMs-Lab

Running on Zero

App Files Files Community

prithivMLmods committed 9 days ago

Commit

6966b5a

verified ·

1 Parent(s): a327584

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -57

app.py CHANGED Viewed

@@ -9,11 +9,13 @@ import re
 import time
 from threading import Thread
 import uuid
 import gradio as gr
 import requests
 import torch
 from PIL import Image
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
@@ -28,7 +30,6 @@ from reportlab.lib.units import inch
 # --- Constants and Model Setup ---
 MAX_INPUT_TOKEN_LENGTH = 4096
-# Note: The following line correctly falls back to CPU if CUDA is not available.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -80,43 +81,36 @@ model_i = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
-# --- PDF Generation Utility Function ---
-def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str) -> str:
     """
-    Generates a PDF document with the input image and extracted text.
     """
-    if image is None or not text_content:
         raise gr.Error("Cannot generate PDF. Image or text content is missing.")
-    filename = f"/tmp/output_{uuid.uuid4()}.pdf"
     doc = SimpleDocTemplate(
-        filename,
         pagesize=A4,
-        rightMargin=inch,
-        leftMargin=inch,
-        topMargin=inch,
-        bottomMargin=inch
     )
     styles = getSampleStyleSheet()
     style_normal = styles["Normal"]
     style_normal.fontSize = int(font_size)
     style_normal.leading = int(font_size) * line_spacing
-    style_normal.alignment = {
-        "Left": 0,
-        "Center": 1,
-        "Right": 2,
-        "Justified": 4
-    }[alignment]
     story = []
-    # Handle Image
-    # Convert PIL image to a format reportlab can use without saving to disk
     img_buffer = BytesIO()
     image.save(img_buffer, format='PNG')
     img_buffer.seek(0)
-    # Image size settings
     page_width, _ = A4
     available_width = page_width - 2 * inch
     image_widths = {
@@ -124,13 +118,12 @@ def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spa
         "Medium": available_width * 0.6,
         "Large": available_width * 0.9,
     }
-    img = RLImage(img_buffer, width=image_widths[image_size], height=image.height * (image_widths[image_size]/image.width))
     story.append(img)
     story.append(Spacer(1, 12))
-    # Handle Text - Replace markdown with spaces for PDF
-    # A simple replacement for basic markdown, for more complex cases a proper parser would be needed
-    cleaned_text = text_content.replace("# ", "").replace("## ", "").replace("*", "")
     text_paragraphs = cleaned_text.split('\n')
     for para in text_paragraphs:
@@ -138,7 +131,23 @@ def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spa
             story.append(Paragraph(para, style_normal))
     doc.build(story)
-    return filename
 # --- Core Application Logic ---
@@ -151,10 +160,8 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
         yield "Please upload an image.", "Please upload an image."
         return
-    # 1. Set prompt for OCR
     text_prompt = ocr_prompt
-    # 2. Select model and processor
     if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
@@ -164,7 +171,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
         yield "Invalid model selected.", "Invalid model selected."
         return
-    # 3. Prepare model inputs and streamer
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
@@ -174,7 +180,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    # 4. Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -182,7 +187,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
         time.sleep(0.01)
         yield buffer , "⏳ Processing..."
-    # 5. Yield the final result for both raw and formatted outputs
     yield buffer, buffer
@@ -193,6 +197,7 @@ def create_gradio_interface():
     .main-container { max-width: 1400px; margin: 0 auto; }
     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.HTML("""
@@ -208,14 +213,8 @@ def create_gradio_interface():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
-                    choices=["Camel-Doc-OCR-080125",
-                             "MonkeyOCR-Recognition",
-                             "olmOCR-7B-0725",
-                             "Nanonets-OCR-s",
-                             "Megalodon-OCR-Sync-0713"
-                            ],
-                    label="Select Model",
-                    value="Nanonets-OCR-s"
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
@@ -226,7 +225,6 @@ def create_gradio_interface():
                     alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
                     image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -238,44 +236,37 @@ def create_gradio_interface():
                         with gr.Row():
                             examples = gr.Examples(
                                 examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
-                                inputs=image_input,
-                                label="Examples"
-                        )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
                         markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 PDF Preview"):
-                        pdf_output_file = gr.File(label="Generated PDF Document", interactive=False)
-                        generate_pdf_btn = gr.Button("📄 Generate PDF", variant="primary")
         # Event Handlers
         def clear_all_outputs():
-            return None, "Raw output will appear here.", "Formatted results will appear here.", None
         process_btn.click(
             fn=process_document_stream,
-            inputs=[model_choice,
-                    image_input,
-                    max_new_tokens],
-            outputs=[raw_output_stream,
-                     markdown_output]
         )
         generate_pdf_btn.click(
-            fn=generate_pdf,
             inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
-            outputs=[pdf_output_file]
         )
         clear_btn.click(
             clear_all_outputs,
-            outputs=[image_input,
-                     raw_output_stream,
-                     markdown_output,
-                     pdf_output_file]
         )
     return demo

 import time
 from threading import Thread
 import uuid
+import tempfile
 import gradio as gr
 import requests
 import torch
 from PIL import Image
+import fitz
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
 # --- Constants and Model Setup ---
 MAX_INPUT_TOKEN_LENGTH = 4096
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 ).to(device).eval()
+# --- PDF Generation and Preview Utility Function ---
+def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
     """
+    Generates a PDF, saves it, and then creates image previews of its pages.
+    Returns the path to the PDF and a list of paths to the preview images.
     """
+    if image is None or not text_content or not text_content.strip():
         raise gr.Error("Cannot generate PDF. Image or text content is missing.")
+    # --- 1. Generate the PDF ---
+    temp_dir = tempfile.gettempdir()
+    pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
     doc = SimpleDocTemplate(
+        pdf_filename,
         pagesize=A4,
+        rightMargin=inch, leftMargin=inch,
+        topMargin=inch, bottomMargin=inch
     )
     styles = getSampleStyleSheet()
     style_normal = styles["Normal"]
     style_normal.fontSize = int(font_size)
     style_normal.leading = int(font_size) * line_spacing
+    style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]
     story = []
     img_buffer = BytesIO()
     image.save(img_buffer, format='PNG')
     img_buffer.seek(0)
     page_width, _ = A4
     available_width = page_width - 2 * inch
     image_widths = {
         "Medium": available_width * 0.6,
         "Large": available_width * 0.9,
     }
+    img_width = image_widths[image_size]
+    img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
     story.append(img)
     story.append(Spacer(1, 12))
+    cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
     text_paragraphs = cleaned_text.split('\n')
     for para in text_paragraphs:
             story.append(Paragraph(para, style_normal))
     doc.build(story)
+    # --- 2. Render PDF pages as images for preview ---
+    preview_images = []
+    try:
+        pdf_doc = fitz.open(pdf_filename)
+        for page_num in range(len(pdf_doc)):
+            page = pdf_doc.load_page(page_num)
+            pix = page.get_pixmap(dpi=150) # Render at 150 DPI for good quality
+            preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
+            pix.save(preview_img_path)
+            preview_images.append(preview_img_path)
+        pdf_doc.close()
+    except Exception as e:
+        print(f"Error generating PDF preview: {e}")
+        # Continue without preview if rendering fails
+    return pdf_filename, preview_images
 # --- Core Application Logic ---
         yield "Please upload an image.", "Please upload an image."
         return
     text_prompt = ocr_prompt
     if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
         yield "Invalid model selected.", "Invalid model selected."
         return
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer , "⏳ Processing..."
     yield buffer, buffer
     .main-container { max-width: 1400px; margin: 0 auto; }
     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
+    #gallery { min-height: 400px; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.HTML("""
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
+                    choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
+                    label="Select Model", value="Nanonets-OCR-s"
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
                     image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                         with gr.Row():
                             examples = gr.Examples(
                                 examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
+                                inputs=image_input, label="Examples"
+                            )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
                         markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 PDF Preview"):
+                        generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
+                        pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
+                        pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
         # Event Handlers
         def clear_all_outputs():
+            return None, "Raw output will appear here.", "Formatted results will appear here.", None, None
         process_btn.click(
             fn=process_document_stream,
+            inputs=[model_choice, image_input, max_new_tokens],
+            outputs=[raw_output_stream, markdown_output]
         )
         generate_pdf_btn.click(
+            fn=generate_and_preview_pdf,
             inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
+            outputs=[pdf_output_file, pdf_preview_gallery]
         )
         clear_btn.click(
             clear_all_outputs,
+            outputs=[image_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
         )
     return demo