Spaces:

prithivMLmods
/

Tiny-VLMs-Lab

Running on Zero

App Files Community

prithivMLmods committed 6 days ago

Commit

a327584

verified ·

1 Parent(s): b051d42

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -80

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import spaces
 import json
 import os
 import traceback
 from io import BytesIO
@@ -7,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import re
 import time
 from threading import Thread
 import gradio as gr
 import requests
@@ -18,15 +20,15 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
 )
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
-from reportlab.lib import colors
 from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
 from reportlab.lib.units import inch
-import uuid
 # --- Constants and Model Setup ---
 MAX_INPUT_TOKEN_LENGTH = 4096
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -40,6 +42,9 @@ if torch.cuda.is_available():
 print("Using device:", device)
 # --- Model Loading ---
 MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-080125"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -74,13 +79,16 @@ model_i = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
-# --- Prompts ---
-ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
-# --- PDF Generation Functions ---
-def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):
-    """Generates a PDF document."""
-    filename = f"output_{uuid.uuid4()}.pdf"
     doc = SimpleDocTemplate(
         filename,
         pagesize=A4,
@@ -90,9 +98,10 @@ def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, ima
         bottomMargin=inch
     )
     styles = getSampleStyleSheet()
-    styles["Normal"].fontSize = int(font_size)
-    styles["Normal"].leading = int(font_size) * line_spacing
-    styles["Normal"].alignment = {
         "Left": 0,
         "Center": 1,
         "Right": 2,
@@ -101,49 +110,61 @@ def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, ima
     story = []
-    # Add image with size adjustment
-    image_sizes = {
-        "Small": (200, 200),
-        "Medium": (400, 400),
-        "Large": (600, 600)
     }
-    img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])
     story.append(img)
     story.append(Spacer(1, 12))
-    # Add plain text output
-    text = Paragraph(plain_text, styles["Normal"])
-    story.append(text)
     doc.build(story)
     return filename
 # --- Core Application Logic ---
 @spaces.GPU
-def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int, font_size: str, line_spacing: float, alignment: str, image_size: str):
     """
-    Main generator function for OCR task, also generating PDF for preview.
     """
     if image is None:
-        yield "Please upload an image.", "Please upload an image.", None
         return
-    # Select model and processor
     if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
     elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
     elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
     else:
-        yield "Invalid model selected.", "Invalid model selected.", None
         return
-    # Save image temporarily for PDF generation
-    temp_image_path = f"temp_{uuid.uuid4()}.png"
-    image.save(temp_image_path)
-    # Prepare model inputs and streamer
-    text_prompt = ocr_prompt
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
@@ -153,23 +174,17 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    # Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        # Generate PDF with current buffer
-        pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
-        yield buffer, buffer, pdf_file
-    # Final PDF with complete output
-    pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
-    yield buffer, buffer, pdf_file
-    # Clean up temporary image file
-    if os.path.exists(temp_image_path):
-        os.remove(temp_image_path)
 # --- Gradio UI Definition ---
 def create_gradio_interface():
@@ -178,15 +193,13 @@ def create_gradio_interface():
     .main-container { max-width: 1400px; margin: 0 auto; }
     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
-    .download-btn { background-color: #35a6d6 !important; color: white !important; }
-    .download-btn:hover { background-color: #22bcff !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.HTML("""
         <div class="title" style="text-align: center">
             <h1>Tiny VLMs Lab🧪</h1>
             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                Advanced Vision-Language Model for Image Content Extraction and PDF Generation
             </p>
         </div>
         """)
@@ -195,39 +208,24 @@ def create_gradio_interface():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
-                    choices=[
-                        "Camel-Doc-OCR-080125",
-                        "MonkeyOCR-Recognition",
-                        "olmOCR-7B-0725",
-                        "Nanonets-OCR-s",
-                        "Megalodon-OCR-Sync-0713"
-                    ],
                     label="Select Model",
                     value="Nanonets-OCR-s"
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
-                    font_size = gr.Dropdown(
-                        choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
-                        value="16",
-                        label="Font Size"
-                    )
-                    line_spacing = gr.Dropdown(
-                        choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
-                        value=1.5,
-                        label="Line Spacing"
-                    )
-                    alignment = gr.Dropdown(
-                        choices=["Left", "Center", "Right", "Justified"],
-                        value="Justified",
-                        label="Text Alignment"
-                    )
-                    image_size = gr.Dropdown(
-                        choices=["Small", "Medium", "Large"],
-                        value="Medium",
-                        label="Image Size"
-                    )
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -236,19 +234,22 @@ def create_gradio_interface():
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
-                        raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=13, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
                                 examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
                                 inputs=image_input,
                                 label="Examples"
-                            )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
-                        with gr.Accordion("(Formatted Result)", open=True):
-                            markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 PDF Preview"):
-                        pdf_output = gr.File(label="Download PDF", interactive=True)
         # Event Handlers
         def clear_all_outputs():
@@ -256,12 +257,25 @@ def create_gradio_interface():
         process_btn.click(
             fn=process_document_stream,
-            inputs=[model_choice, image_input, max_new_tokens, font_size, line_spacing, alignment, image_size],
-            outputs=[raw_output_stream, markdown_output, pdf_output]
         )
         clear_btn.click(
-            fn=clear_all_outputs,
-            outputs=[image_input, raw_output_stream, markdown_output, pdf_output]
         )
     return demo

 import spaces
 import json
+import math
 import os
 import traceback
 from io import BytesIO
 import re
 import time
 from threading import Thread
+import uuid
 import gradio as gr
 import requests
     AutoProcessor,
     TextIteratorStreamer,
 )
 from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
 from reportlab.lib.units import inch
 # --- Constants and Model Setup ---
 MAX_INPUT_TOKEN_LENGTH = 4096
+# Note: The following line correctly falls back to CPU if CUDA is not available.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 print("Using device:", device)
+# --- Prompts for Different Tasks ---
+ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
 # --- Model Loading ---
 MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-080125"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
     MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
+# --- PDF Generation Utility Function ---
+def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str) -> str:
+    """
+    Generates a PDF document with the input image and extracted text.
+    """
+    if image is None or not text_content:
+        raise gr.Error("Cannot generate PDF. Image or text content is missing.")
+    filename = f"/tmp/output_{uuid.uuid4()}.pdf"
     doc = SimpleDocTemplate(
         filename,
         pagesize=A4,
         bottomMargin=inch
     )
     styles = getSampleStyleSheet()
+    style_normal = styles["Normal"]
+    style_normal.fontSize = int(font_size)
+    style_normal.leading = int(font_size) * line_spacing
+    style_normal.alignment = {
         "Left": 0,
         "Center": 1,
         "Right": 2,
     story = []
+    # Handle Image
+    # Convert PIL image to a format reportlab can use without saving to disk
+    img_buffer = BytesIO()
+    image.save(img_buffer, format='PNG')
+    img_buffer.seek(0)
+    # Image size settings
+    page_width, _ = A4
+    available_width = page_width - 2 * inch
+    image_widths = {
+        "Small": available_width * 0.3,
+        "Medium": available_width * 0.6,
+        "Large": available_width * 0.9,
     }
+    img = RLImage(img_buffer, width=image_widths[image_size], height=image.height * (image_widths[image_size]/image.width))
     story.append(img)
     story.append(Spacer(1, 12))
+    # Handle Text - strip basic markdown markers (heading hashes, asterisks) before adding to the PDF
+    # This is a simple string replacement for basic markdown; more complex cases would need a proper parser
+    cleaned_text = text_content.replace("# ", "").replace("## ", "").replace("*", "")
+    text_paragraphs = cleaned_text.split('\n')
+    for para in text_paragraphs:
+        if para.strip():
+            story.append(Paragraph(para, style_normal))
     doc.build(story)
     return filename
 # --- Core Application Logic ---
 @spaces.GPU
+def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int):
     """
+    Main generator function that handles OCR tasks.
     """
     if image is None:
+        yield "Please upload an image.", "Please upload an image."
         return
+    # 1. Set prompt for OCR
+    text_prompt = ocr_prompt
+    # 2. Select model and processor
     if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
     elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
     elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
     else:
+        yield "Invalid model selected.", "Invalid model selected."
         return
+    # 3. Prepare model inputs and streamer
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+    # 4. Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
+        yield buffer , "⏳ Processing..."
+    # 5. Yield the final result for both raw and formatted outputs
+    yield buffer, buffer
 # --- Gradio UI Definition ---
 def create_gradio_interface():
     .main-container { max-width: 1400px; margin: 0 auto; }
     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.HTML("""
         <div class="title" style="text-align: center">
             <h1>Tiny VLMs Lab🧪</h1>
             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
+                Advanced Vision-Language Model for Image Content and Layout Extraction
             </p>
         </div>
         """)
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
+                    choices=["Camel-Doc-OCR-080125",
+                             "MonkeyOCR-Recognition",
+                             "olmOCR-7B-0725",
+                             "Nanonets-OCR-s",
+                             "Megalodon-OCR-Sync-0713"
+                            ],
                     label="Select Model",
                     value="Nanonets-OCR-s"
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
+                    gr.Markdown("### PDF Export Settings")
+                    font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
+                    line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
+                    alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
+                    image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
+                        raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
                                 examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
                                 inputs=image_input,
                                 label="Examples"
+                        )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
+                        markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 PDF Preview"):
+                        pdf_output_file = gr.File(label="Generated PDF Document", interactive=False)
+                        generate_pdf_btn = gr.Button("📄 Generate PDF", variant="primary")
         # Event Handlers
         def clear_all_outputs():
         process_btn.click(
             fn=process_document_stream,
+            inputs=[model_choice,
+                    image_input,
+                    max_new_tokens],
+            outputs=[raw_output_stream,
+                     markdown_output]
         )
+        generate_pdf_btn.click(
+            fn=generate_pdf,
+            inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
+            outputs=[pdf_output_file]
+        )
         clear_btn.click(
+            clear_all_outputs,
+            outputs=[image_input,
+                     raw_output_stream,
+                     markdown_output,
+                     pdf_output_file]
         )
     return demo