Spaces: Running on Zero
Commit · c7a30f7
Parent(s): c80cc87
add olmo

app.py CHANGED
@@ -3,7 +3,7 @@ from PIL import Image
 import xml.etree.ElementTree as ET
 import os
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
@@ -14,7 +14,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
 
 # Load RolmOCR
 try:
@@ -46,6 +46,19 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
 
+# Load olmOCR
+try:
+    PROCESSORS["olmOCR"] = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+    MODELS["olmOCR"] = Qwen2VLForConditionalGeneration.from_pretrained(
+        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
+    )
+    PIPELINES["olmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["olmOCR"], processor=PROCESSORS["olmOCR"]
+    )
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
+    print(f"Error loading olmOCR: {e}")
+
 
 # --- Helper Functions ---
 
@@ -193,7 +206,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-
+    elif model_name == "Nanonets-OCR-s":
         messages = [
             {
                 "role": "user",
@@ -206,6 +219,19 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
+    else:  # olmOCR
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {
+                        "type": "text",
+                        "text": "Extract all text from this document image, preserving the original reading order and layout structure. Return the plain text representation.",
+                    },
+                ],
+            }
+        ]
     max_tokens = 8096
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
@@ -354,7 +380,8 @@ with gr.Blocks() as demo:
         "Upload a historical document image and its XML file to compare these approaches side-by-side. "
         "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
        "**Available models:** [RolmOCR](https://huggingface.co/reducto/RolmOCR) | "
-        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
+        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) | "
+        "[olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview)"
     )
 
     gr.Markdown("---")
@@ -394,7 +421,7 @@ with gr.Blocks() as demo:
             choices=AVAILABLE_MODELS,
            value="RolmOCR",
             label="Choose Model",
-            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
+            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
         )
 
         submit_button = gr.Button(
@@ -461,6 +488,11 @@ with gr.Blocks() as demo:
                 "examples/one/74442232.34.xml",
                 "Nanonets-OCR-s",
             ],
+            [
+                "examples/one/74442232.3.jpg",
+                "examples/one/74442232.34.xml",
+                "olmOCR",
+            ],
         ],
         inputs=[image_input, xml_input, model_selector],
         outputs=[
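For trying the new model outside the Space, here is a minimal standalone sketch of the pattern this commit adds: load olmOCR with the Qwen2-VL processor and call it through the `image-text-to-text` pipeline. It is a sketch, not the Space's exact code: the `page.jpg` path is hypothetical, the prompt and `max_new_tokens` value simply mirror the app, and it assumes a recent transformers release that ships the `image-text-to-text` pipeline task plus a GPU with room for a 7B bfloat16 model.

```python
# Minimal standalone sketch of the pattern added in this commit (assumptions:
# a recent transformers release with the "image-text-to-text" pipeline task,
# and a GPU that can hold a 7B model in bfloat16).
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, pipeline

# olmOCR-7B is a fine-tune of Qwen2-VL-7B-Instruct, so it reuses that processor.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
)
ocr = pipeline("image-text-to-text", model=model, processor=processor)

image = Image.open("page.jpg")  # hypothetical input file

# Chat-style messages, as in the app's predict() else-branch for olmOCR.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {
                "type": "text",
                "text": "Extract all text from this document image, preserving "
                "the original reading order and layout structure. "
                "Return the plain text representation.",
            },
        ],
    }
]

# max_new_tokens mirrors the app's max_tokens = 8096.
result = ocr(messages, max_new_tokens=8096)
# The pipeline returns a list of dicts; "generated_text" holds the model output
# (its exact shape for chat inputs varies slightly across transformers versions).
print(result[0]["generated_text"])
```

Note that the commit wires olmOCR into the same registry pattern as the existing models (PROCESSORS, MODELS, PIPELINES, and MODEL_LOAD_ERROR_MSG inside a try/except), so a failed download degrades to an in-app error message rather than crashing the Space.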