prithivMLmods committed (verified)
Commit b051d42 · 1 Parent(s): 23ca3e5

Update app.py

Files changed (1): app.py (+105 -148)
app.py CHANGED
@@ -1,6 +1,5 @@
 import spaces
 import json
-import math
 import os
 import traceback
 from io import BytesIO
@@ -15,22 +14,19 @@ import torch
 from PIL import Image
 
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModel,
-    AutoTokenizer,
 )
-
-from transformers.image_utils import load_image
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib import colors
+from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
+from reportlab.lib.units import inch
+import uuid
 
 # --- Constants and Model Setup ---
 MAX_INPUT_TOKEN_LENGTH = 4096
-# Note: The following line correctly falls back to CPU if CUDA is not available.
-# Let the environment (e.g., Hugging Face Spaces) determine the device.
-# This avoids conflicts with the CUDA environment setup by the platform.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -44,24 +40,6 @@ if torch.cuda.is_available():
 
 print("Using device:", device)
 
-# --- Model Loading ---
-
-# --- Prompts for Different Tasks ---
-layout_prompt = """Please output the layout information from the image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
-
-1. Bbox format: [x1, y1, x2, y2]
-2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
-3. Text Extraction & Formatting Rules:
-    - For tables, provide the content in a structured JSON format.
-    - For all other elements, provide the plain text.
-4. Constraints:
-    - The output must be the original text from the image.
-    - All layout elements must be sorted according to human reading order.
-5. Final Output: The entire output must be a single JSON object wrapped in ```json ... ```.
-"""
-
-ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
-
 # --- Model Loading ---
 MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-080125"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -96,78 +74,61 @@ model_i = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
 
-# --- Utility Functions ---
-def layoutjson2md(layout_data: Any) -> str:
-    """
-    FIXED: Converts the structured JSON from Layout Analysis into formatted Markdown.
-    This version is robust against malformed JSON from the model.
-    """
-    markdown_lines = []
-
-    # If the model wraps the list in a dictionary, find and extract the list.
-    if isinstance(layout_data, dict):
-        found_list = None
-        for value in layout_data.values():
-            if isinstance(value, list):
-                found_list = value
-                break
-        if found_list is not None:
-            layout_data = found_list
-        else:
-            return "### Error: Could not find a list of layout items in the JSON object."
-
-    if not isinstance(layout_data, list):
-        return f"### Error: Expected a list of layout items, but received type {type(layout_data).__name__}."
-
-    try:
-        # Filter out any non-dictionary items and sort by reading order.
-        valid_items = [item for item in layout_data if isinstance(item, dict)]
-        sorted_items = sorted(valid_items, key=lambda x: (x.get('bbox', [0, 0, 0, 0])[1], x.get('bbox', [0, 0, 0, 0])[0]))
-
-        for item in sorted_items:
-            category = item.get('category', 'Text')  # Default to 'Text' if no category
-            text = item.get('text', '')
-            if not text:
-                continue
-
-            if category == 'Title':
-                markdown_lines.append(f"# {text}\n")
-            elif category == 'Section-header':
-                markdown_lines.append(f"## {text}\n")
-            elif category == 'Table':
-                if isinstance(text, dict) and 'header' in text and 'rows' in text:
-                    header = '| ' + ' | '.join(map(str, text['header'])) + ' |'
-                    separator = '| ' + ' | '.join(['---'] * len(text['header'])) + ' |'
-                    rows = ['| ' + ' | '.join(map(str, row)) + ' |' for row in text['rows']]
-                    markdown_lines.extend([header, separator] + rows)
-                    markdown_lines.append("\n")
-                else:  # Fallback for simple text or malformed tables
-                    markdown_lines.append(f"{text}\n")
-            else:
-                markdown_lines.append(f"{text}\n")
-
-    except Exception as e:
-        print(f"Error converting to markdown: {e}")
-        traceback.print_exc()
-        return "### Error: An unexpected error occurred while converting JSON to Markdown."
-
-    return "\n".join(markdown_lines)
+# --- Prompts ---
+ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
+
+# --- PDF Generation Functions ---
+def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):
+    """Generates a PDF document."""
+    filename = f"output_{uuid.uuid4()}.pdf"
+    doc = SimpleDocTemplate(
+        filename,
+        pagesize=A4,
+        rightMargin=inch,
+        leftMargin=inch,
+        topMargin=inch,
+        bottomMargin=inch
+    )
+    styles = getSampleStyleSheet()
+    styles["Normal"].fontSize = int(font_size)
+    styles["Normal"].leading = int(font_size) * line_spacing
+    styles["Normal"].alignment = {
+        "Left": 0,
+        "Center": 1,
+        "Right": 2,
+        "Justified": 4
+    }[alignment]
+
+    story = []
+
+    # Add image with size adjustment
+    image_sizes = {
+        "Small": (200, 200),
+        "Medium": (400, 400),
+        "Large": (600, 600)
+    }
+    img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])
+    story.append(img)
+    story.append(Spacer(1, 12))
+
+    # Add plain text output
+    text = Paragraph(plain_text, styles["Normal"])
+    story.append(text)
+
+    doc.build(story)
+    return filename
 
 # --- Core Application Logic ---
 @spaces.GPU
-def process_document_stream(model_name: str, task_choice: str, image: Image.Image, max_new_tokens: int):
+def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int, font_size: str, line_spacing: float, alignment: str, image_size: str):
     """
-    Main generator function that handles both OCR and Layout Analysis tasks.
+    Main generator function for OCR task, also generating PDF for preview.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image.", None
         return
 
-    # 1. Select prompt based on user's task choice
-    text_prompt = ocr_prompt if task_choice == "Content Extraction" else layout_prompt
-
-    # 2. Select model and processor
+    # Select model and processor
     if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
@@ -177,7 +138,12 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Image, max_new_tokens: int):
         yield "Invalid model selected.", "Invalid model selected.", None
         return
 
-    # 3. Prepare model inputs and streamer
+    # Save image temporarily for PDF generation
+    temp_image_path = f"temp_{uuid.uuid4()}.png"
+    image.save(temp_image_path)
+
+    # Prepare model inputs and streamer
+    text_prompt = ocr_prompt
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
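One robustness note on this hunk: the temporary image lands in the working directory and is removed only when the generator runs to completion (see the cleanup at the end of the next hunk), so an exception mid-stream leaves the file behind. A sketch of the standard tempfile/finally pattern, offered as an alternative rather than as what the commit does:

```python
import os
import tempfile

# Let the OS pick a unique path; delete=False so the file survives
# until we remove it ourselves after the PDF builds are done.
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
    temp_image_path = tmp.name
image.save(temp_image_path)

try:
    ...  # model generation, streaming, and PDF builds go here
finally:
    os.remove(temp_image_path)  # runs even if generation raises
```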
@@ -187,41 +153,23 @@
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # 4. Stream raw output to the UI in real-time
+    # Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer, "⏳ Processing...", {"status": "streaming"}
-
-    # 5. Post-process the final buffer based on the selected task
-    if task_choice == "Content Extraction":
-        # For OCR, the buffer is the final result.
-        yield buffer, buffer, None
-    else:  # Layout Analysis
-        try:
-            json_match = re.search(r'```json\s*([\s\S]+?)\s*```', buffer)
-            if not json_match:
-                # If no JSON block is found, try to parse the whole buffer as a fallback.
-                try:
-                    layout_data = json.loads(buffer)
-                    markdown_content = layoutjson2md(layout_data)
-                    yield buffer, markdown_content, layout_data
-                    return
-                except json.JSONDecodeError:
-                    raise ValueError("JSON object not found in the model's output.")
-
-            json_str = json_match.group(1)
-            layout_data = json.loads(json_str)
-            markdown_content = layoutjson2md(layout_data)
-
-            yield buffer, markdown_content, layout_data
-        except Exception as e:
-            error_md = f"❌ **Error:** Failed to parse Layout JSON.\n\n**Details:**\n`{str(e)}`\n\n**Raw Output:**\n```\n{buffer}\n```"
-            error_json = {"error": "ProcessingError", "details": str(e), "raw_output": buffer}
-            yield buffer, error_md, error_json
+        # Generate PDF with current buffer
+        pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
+        yield buffer, buffer, pdf_file
+
+    # Final PDF with complete output
+    pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
+    yield buffer, buffer, pdf_file
+
+    # Clean up temporary image file
+    if os.path.exists(temp_image_path):
+        os.remove(temp_image_path)
 
 # --- Gradio UI Definition ---
 def create_gradio_interface():
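Performance note on the streaming loop above: the full PDF is rebuilt via generate_pdf on every streamed chunk, so a long document triggers hundreds of complete ReportLab builds. A throttled variant is sketched below; the chunk counter and PDF_REFRESH_INTERVAL are hypothetical and not part of this commit:

```python
# Sketch: rebuild the PDF every N chunks instead of on every chunk.
PDF_REFRESH_INTERVAL = 25  # hypothetical tuning knob

buffer, pdf_file, n_chunks = "", None, 0
for new_text in streamer:
    buffer += new_text
    buffer = buffer.replace("<|im_end|>", "")
    n_chunks += 1
    if n_chunks % PDF_REFRESH_INTERVAL == 0:
        pdf_file = generate_pdf(temp_image_path, buffer, font_size,
                                line_spacing, alignment, image_size)
    yield buffer, buffer, pdf_file

# One final build so the download always reflects the complete text.
pdf_file = generate_pdf(temp_image_path, buffer, font_size,
                        line_spacing, alignment, image_size)
yield buffer, buffer, pdf_file
```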
@@ -230,13 +178,15 @@ def create_gradio_interface():
     .main-container { max-width: 1400px; margin: 0 auto; }
     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
+    .download-btn { background-color: #35a6d6 !important; color: white !important; }
+    .download-btn:hover { background-color: #22bcff !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.HTML("""
         <div class="title" style="text-align: center">
             <h1>Tiny VLMs Lab🧪</h1>
             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                Advanced Vision-Language Model for Image Content and Layout Extraction
+                Advanced Vision-Language Model for Image Content Extraction and PDF Generation
             </p>
         </div>
         """)
@@ -245,23 +195,39 @@ def create_gradio_interface():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
-                    choices=["Camel-Doc-OCR-080125",
-                             "MonkeyOCR-Recognition",
-                             "olmOCR-7B-0725",
-                             "Nanonets-OCR-s",
-                             "Megalodon-OCR-Sync-0713"
-                             ],
-                    label="Select Model",
+                    choices=[
+                        "Camel-Doc-OCR-080125",
+                        "MonkeyOCR-Recognition",
+                        "olmOCR-7B-0725",
+                        "Nanonets-OCR-s",
+                        "Megalodon-OCR-Sync-0713"
+                    ],
+                    label="Select Model",
                     value="Nanonets-OCR-s"
                 )
-                task_choice = gr.Dropdown(
-                    choices=["Content Extraction",
-                             "Layout Analysis(.json)"],
-                    label="Select Task", value="Content Extraction"
-                )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
+                    font_size = gr.Dropdown(
+                        choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
+                        value="16",
+                        label="Font Size"
+                    )
+                    line_spacing = gr.Dropdown(
+                        choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
+                        value=1.5,
+                        label="Line Spacing"
+                    )
+                    alignment = gr.Dropdown(
+                        choices=["Left", "Center", "Right", "Justified"],
+                        value="Justified",
+                        label="Text Alignment"
+                    )
+                    image_size = gr.Dropdown(
+                        choices=["Small", "Medium", "Large"],
+                        value="Medium",
+                        label="Image Size"
+                    )
 
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -276,14 +242,13 @@ def create_gradio_interface():
                     examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
                     inputs=image_input,
                     label="Examples"
-                    )
+                )
                 gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
             with gr.Tab("📰 README.md"):
                 with gr.Accordion("(Formatted Result)", open=True):
                     markdown_output = gr.Markdown(label="Formatted Markdown")
-
-            with gr.Tab("📋 Layout Analysis Results"):
-                json_output = gr.JSON(label="Structured Layout Data (JSON)")
+            with gr.Tab("📋 PDF Preview"):
+                pdf_output = gr.File(label="Download PDF", interactive=True)
 
         # Event Handlers
         def clear_all_outputs():
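Naming note: gr.File renders a download entry rather than an embedded document view, so the "📋 PDF Preview" tab effectively offers the generated PDF as a download (consistent with the component's "Download PDF" label) instead of an inline preview.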
@@ -291,20 +256,12 @@ def create_gradio_interface():
 
         process_btn.click(
             fn=process_document_stream,
-            inputs=[model_choice,
-                    task_choice,
-                    image_input,
-                    max_new_tokens],
-            outputs=[raw_output_stream,
-                     markdown_output,
-                     json_output]
+            inputs=[model_choice, image_input, max_new_tokens, font_size, line_spacing, alignment, image_size],
+            outputs=[raw_output_stream, markdown_output, pdf_output]
         )
         clear_btn.click(
-            clear_all_outputs,
-            outputs=[image_input,
-                     raw_output_stream,
-                     markdown_output,
-                     json_output]
+            fn=clear_all_outputs,
+            outputs=[image_input, raw_output_stream, markdown_output, pdf_output]
         )
         return demo
 
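The diff ends at create_gradio_interface's return; the script's entry point is untouched by this commit. For completeness, a typical launch block for a Blocks app of this shape (the queue settings are an assumption, not taken from app.py):

```python
# Hypothetical entry point; not part of this diff.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.queue(max_size=20).launch(show_error=True)
```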