Spaces: Running on Zero
Commit · c7a30f7
Parent(s): c80cc87
add olmo

app.py CHANGED
@@ -3,7 +3,7 @@ from PIL import Image
 import xml.etree.ElementTree as ET
 import os
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
@@ -14,7 +14,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
 
 # Load RolmOCR
 try:
@@ -46,6 +46,19 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
 
+# Load olmOCR
+try:
+    PROCESSORS["olmOCR"] = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+    MODELS["olmOCR"] = Qwen2VLForConditionalGeneration.from_pretrained(
+        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
+    )
+    PIPELINES["olmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["olmOCR"], processor=PROCESSORS["olmOCR"]
+    )
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
+    print(f"Error loading olmOCR: {e}")
+
 
 # --- Helper Functions ---
 
@@ -193,7 +206,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-
+    elif model_name == "Nanonets-OCR-s":
         messages = [
             {
                 "role": "user",
@@ -206,6 +219,19 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
+    else:  # olmOCR
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {
+                        "type": "text",
+                        "text": "Extract all text from this document image, preserving the original reading order and layout structure. Return the plain text representation.",
+                    },
+                ],
+            }
+        ]
     max_tokens = 8096
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
@@ -354,7 +380,8 @@ with gr.Blocks() as demo:
         "Upload a historical document image and its XML file to compare these approaches side-by-side. "
         "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
        "**Available models:** [RolmOCR](https://huggingface.co/reducto/RolmOCR) | "
-        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
+        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) | "
+        "[olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview)"
     )
 
     gr.Markdown("---")
@@ -394,7 +421,7 @@ with gr.Blocks() as demo:
             choices=AVAILABLE_MODELS,
            value="RolmOCR",
             label="Choose Model",
-            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
+            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
         )
 
         submit_button = gr.Button(
@@ -461,6 +488,11 @@ with gr.Blocks() as demo:
                 "examples/one/74442232.34.xml",
                 "Nanonets-OCR-s",
             ],
+            [
+                "examples/one/74442232.3.jpg",
+                "examples/one/74442232.34.xml",
+                "olmOCR",
+            ],
         ],
         inputs=[image_input, xml_input, model_selector],
         outputs=[
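For trying the new model outside the Space, here is a minimal standalone sketch of the pattern this commit adds: load olmOCR with the Qwen2-VL processor and call it through the `image-text-to-text` pipeline. It is a sketch, not the Space's exact code: the `page.jpg` path is hypothetical, the prompt and `max_new_tokens` value simply mirror the app, and it assumes a recent transformers release that ships the `image-text-to-text` pipeline task plus a GPU with room for a 7B bfloat16 model.

```python
# Minimal standalone sketch of the pattern added in this commit (assumptions:
# a recent transformers release with the "image-text-to-text" pipeline task,
# and a GPU that can hold a 7B model in bfloat16).
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, pipeline

# olmOCR-7B is a fine-tune of Qwen2-VL-7B-Instruct, so it reuses that processor.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
)
ocr = pipeline("image-text-to-text", model=model, processor=processor)

image = Image.open("page.jpg")  # hypothetical input file

# Chat-style messages, as in the app's predict() else-branch for olmOCR.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {
                "type": "text",
                "text": "Extract all text from this document image, preserving "
                "the original reading order and layout structure. "
                "Return the plain text representation.",
            },
        ],
    }
]

# max_new_tokens mirrors the app's max_tokens = 8096.
result = ocr(messages, max_new_tokens=8096)
# The pipeline returns a list of dicts; "generated_text" holds the model output
# (its exact shape for chat inputs varies slightly across transformers versions).
print(result[0]["generated_text"])
```

Note that the commit wires olmOCR into the same registry pattern as the existing models (PROCESSORS, MODELS, PIPELINES, and MODEL_LOAD_ERROR_MSG inside a try/except), so a failed download degrades to an in-app error message rather than crashing the Space.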