Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
4386729
1
Parent(s):
4af31e3
try adding flux...
Browse files
- app.py +40 -6
- requirements.txt +2 -1
app.py
CHANGED
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
|
4 |
import os
|
5 |
import torch
|
6 |
import json
|
7 |
-
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
|
8 |
import spaces
|
9 |
|
10 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
|
@@ -15,7 +15,7 @@ PIPELINES = {}
|
|
15 |
MODEL_LOAD_ERROR_MSG = {}
|
16 |
|
17 |
# Available models
|
18 |
-
AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
|
19 |
|
20 |
# Load RolmOCR
|
21 |
try:
|
@@ -60,6 +60,21 @@ except Exception as e:
|
|
60 |
MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
|
61 |
print(f"Error loading olmOCR: {e}")
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# --- Helper Functions ---
|
65 |
|
@@ -220,7 +235,7 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
220 |
],
|
221 |
}
|
222 |
]
|
223 |
-
|
224 |
messages = [
|
225 |
{
|
226 |
"role": "user",
|
@@ -233,6 +248,19 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
233 |
],
|
234 |
}
|
235 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
max_tokens = 8096
|
237 |
# Use the pipeline with the properly formatted messages
|
238 |
return selected_pipe(messages, max_new_tokens=max_tokens)
|
@@ -409,13 +437,14 @@ with gr.Blocks() as demo:
|
|
409 |
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
410 |
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
411 |
"Now, Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner output. "
|
412 |
-
"This Space lets you compare
|
413 |
"Upload a historical document image and its XML file to see them side-by-side. "
|
414 |
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
|
415 |
"**Available models:**\n"
|
416 |
"• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
|
417 |
"• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
|
418 |
-
"• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist"
|
|
|
419 |
)
|
420 |
|
421 |
gr.Markdown("---")
|
@@ -455,7 +484,7 @@ with gr.Blocks() as demo:
|
|
455 |
choices=AVAILABLE_MODELS,
|
456 |
value="RolmOCR",
|
457 |
label="Choose Model",
|
458 |
-
info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
|
459 |
)
|
460 |
|
461 |
submit_button = gr.Button(
|
@@ -527,6 +556,11 @@ with gr.Blocks() as demo:
|
|
527 |
"examples/one/74442232.34.xml",
|
528 |
"olmOCR",
|
529 |
],
|
|
|
|
|
|
|
|
|
|
|
530 |
],
|
531 |
inputs=[image_input, xml_input, model_selector],
|
532 |
outputs=[
|
|
|
4 |
import os
|
5 |
import torch
|
6 |
import json
|
7 |
+
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
|
8 |
import spaces
|
9 |
|
10 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
|
|
|
15 |
MODEL_LOAD_ERROR_MSG = {}
|
16 |
|
17 |
# Available models
|
18 |
+
AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B"]
|
19 |
|
20 |
# Load RolmOCR
|
21 |
try:
|
|
|
60 |
MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
|
61 |
print(f"Error loading olmOCR: {e}")
|
62 |
|
63 |
+
# Load OCRFlux-3B
|
64 |
+
try:
|
65 |
+
PROCESSORS["OCRFlux-3B"] = AutoProcessor.from_pretrained("ChatDOC/OCRFlux-3B")
|
66 |
+
MODELS["OCRFlux-3B"] = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
67 |
+
"ChatDOC/OCRFlux-3B", torch_dtype=torch.bfloat16, device_map="auto"
|
68 |
+
)
|
69 |
+
PIPELINES["OCRFlux-3B"] = pipeline(
|
70 |
+
"image-text-to-text",
|
71 |
+
model=MODELS["OCRFlux-3B"],
|
72 |
+
processor=PROCESSORS["OCRFlux-3B"]
|
73 |
+
)
|
74 |
+
except Exception as e:
|
75 |
+
MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
|
76 |
+
print(f"Error loading OCRFlux-3B: {e}")
|
77 |
+
|
78 |
|
79 |
# --- Helper Functions ---
|
80 |
|
|
|
235 |
],
|
236 |
}
|
237 |
]
|
238 |
+
elif model_name == "olmOCR":
|
239 |
messages = [
|
240 |
{
|
241 |
"role": "user",
|
|
|
248 |
],
|
249 |
}
|
250 |
]
|
251 |
+
else: # OCRFlux-3B
|
252 |
+
messages = [
|
253 |
+
{
|
254 |
+
"role": "user",
|
255 |
+
"content": [
|
256 |
+
{"type": "image", "image": pil_image},
|
257 |
+
{
|
258 |
+
"type": "text",
|
259 |
+
"text": "Convert this document page to clean, readable markdown format. Preserve all text content, maintain the original reading order, convert tables to markdown table format, and include any mathematical equations in LaTeX format.",
|
260 |
+
},
|
261 |
+
],
|
262 |
+
}
|
263 |
+
]
|
264 |
max_tokens = 8096
|
265 |
# Use the pipeline with the properly formatted messages
|
266 |
return selected_pipe(messages, max_new_tokens=max_tokens)
|
|
|
437 |
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
438 |
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
439 |
"Now, Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner output. "
|
440 |
+
"This Space lets you compare four leading VLM-based OCR models against traditional approaches. "
|
441 |
"Upload a historical document image and its XML file to see them side-by-side. "
|
442 |
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
|
443 |
"**Available models:**\n"
|
444 |
"• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
|
445 |
"• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
|
446 |
+
"• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
|
447 |
+
"• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging"
|
448 |
)
|
449 |
|
450 |
gr.Markdown("---")
|
|
|
484 |
choices=AVAILABLE_MODELS,
|
485 |
value="RolmOCR",
|
486 |
label="Choose Model",
|
487 |
+
info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging",
|
488 |
)
|
489 |
|
490 |
submit_button = gr.Button(
|
|
|
556 |
"examples/one/74442232.34.xml",
|
557 |
"olmOCR",
|
558 |
],
|
559 |
+
[
|
560 |
+
"examples/one/74442232.3.jpg",
|
561 |
+
"examples/one/74442232.34.xml",
|
562 |
+
"OCRFlux-3B",
|
563 |
+
],
|
564 |
],
|
565 |
inputs=[image_input, xml_input, model_selector],
|
566 |
outputs=[
|
requirements.txt
CHANGED
@@ -3,7 +3,8 @@ torch
|
|
3 |
gradio
|
4 |
Pillow
|
5 |
lxml
|
6 |
-
transformers
|
|
|
7 |
spaces
|
8 |
torchvision
|
9 |
accelerate
|
|
|
3 |
gradio
|
4 |
Pillow
|
5 |
lxml
|
6 |
+
transformers>=4.49.0
|
7 |
+
qwen-vl-utils
|
8 |
spaces
|
9 |
torchvision
|
10 |
accelerate
|