Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
4386729
1
Parent(s):
4af31e3
try adding flux...
Browse files
- app.py +40 -6
- requirements.txt +2 -1
app.py
CHANGED
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
|
|
4 |
import os
|
5 |
import torch
|
6 |
import json
|
7 |
-
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
|
8 |
import spaces
|
9 |
|
10 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
|
@@ -15,7 +15,7 @@ PIPELINES = {}
|
|
15 |
MODEL_LOAD_ERROR_MSG = {}
|
16 |
|
17 |
# Available models
|
18 |
-
AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
|
19 |
|
20 |
# Load RolmOCR
|
21 |
try:
|
@@ -60,6 +60,21 @@ except Exception as e:
|
|
60 |
MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
|
61 |
print(f"Error loading olmOCR: {e}")
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# --- Helper Functions ---
|
65 |
|
@@ -220,7 +235,7 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
220 |
],
|
221 |
}
|
222 |
]
|
223 |
-
|
224 |
messages = [
|
225 |
{
|
226 |
"role": "user",
|
@@ -233,6 +248,19 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
233 |
],
|
234 |
}
|
235 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
max_tokens = 8096
|
237 |
# Use the pipeline with the properly formatted messages
|
238 |
return selected_pipe(messages, max_new_tokens=max_tokens)
|
@@ -409,13 +437,14 @@ with gr.Blocks() as demo:
|
|
409 |
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
410 |
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
411 |
"Now, Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner output. "
|
412 |
-
"This Space lets you compare
|
413 |
"Upload a historical document image and its XML file to see them side-by-side. "
|
414 |
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
|
415 |
"**Available models:**\n"
|
416 |
"• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
|
417 |
"• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
|
418 |
-
"• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist"
|
|
|
419 |
)
|
420 |
|
421 |
gr.Markdown("---")
|
@@ -455,7 +484,7 @@ with gr.Blocks() as demo:
|
|
455 |
choices=AVAILABLE_MODELS,
|
456 |
value="RolmOCR",
|
457 |
label="Choose Model",
|
458 |
-
info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
|
459 |
)
|
460 |
|
461 |
submit_button = gr.Button(
|
@@ -527,6 +556,11 @@ with gr.Blocks() as demo:
|
|
527 |
"examples/one/74442232.34.xml",
|
528 |
"olmOCR",
|
529 |
],
|
|
|
|
|
|
|
|
|
|
|
530 |
],
|
531 |
inputs=[image_input, xml_input, model_selector],
|
532 |
outputs=[
|
|
|
4 |
import os
|
5 |
import torch
|
6 |
import json
|
7 |
+
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
|
8 |
import spaces
|
9 |
|
10 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
|
|
|
15 |
MODEL_LOAD_ERROR_MSG = {}
|
16 |
|
17 |
# Available models
|
18 |
+
AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B"]
|
19 |
|
20 |
# Load RolmOCR
|
21 |
try:
|
|
|
60 |
MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
|
61 |
print(f"Error loading olmOCR: {e}")
|
62 |
|
63 |
+
# Load OCRFlux-3B
|
64 |
+
try:
|
65 |
+
PROCESSORS["OCRFlux-3B"] = AutoProcessor.from_pretrained("ChatDOC/OCRFlux-3B")
|
66 |
+
MODELS["OCRFlux-3B"] = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
67 |
+
"ChatDOC/OCRFlux-3B", torch_dtype=torch.bfloat16, device_map="auto"
|
68 |
+
)
|
69 |
+
PIPELINES["OCRFlux-3B"] = pipeline(
|
70 |
+
"image-text-to-text",
|
71 |
+
model=MODELS["OCRFlux-3B"],
|
72 |
+
processor=PROCESSORS["OCRFlux-3B"]
|
73 |
+
)
|
74 |
+
except Exception as e:
|
75 |
+
MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
|
76 |
+
print(f"Error loading OCRFlux-3B: {e}")
|
77 |
+
|
78 |
|
79 |
# --- Helper Functions ---
|
80 |
|
|
|
235 |
],
|
236 |
}
|
237 |
]
|
238 |
+
elif model_name == "olmOCR":
|
239 |
messages = [
|
240 |
{
|
241 |
"role": "user",
|
|
|
248 |
],
|
249 |
}
|
250 |
]
|
251 |
+
else: # OCRFlux-3B
|
252 |
+
messages = [
|
253 |
+
{
|
254 |
+
"role": "user",
|
255 |
+
"content": [
|
256 |
+
{"type": "image", "image": pil_image},
|
257 |
+
{
|
258 |
+
"type": "text",
|
259 |
+
"text": "Convert this document page to clean, readable markdown format. Preserve all text content, maintain the original reading order, convert tables to markdown table format, and include any mathematical equations in LaTeX format.",
|
260 |
+
},
|
261 |
+
],
|
262 |
+
}
|
263 |
+
]
|
264 |
max_tokens = 8096
|
265 |
# Use the pipeline with the properly formatted messages
|
266 |
return selected_pipe(messages, max_new_tokens=max_tokens)
|
|
|
437 |
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
438 |
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
439 |
"Now, Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner output. "
|
440 |
+
"This Space lets you compare four leading VLM-based OCR models against traditional approaches. "
|
441 |
"Upload a historical document image and its XML file to see them side-by-side. "
|
442 |
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
|
443 |
"**Available models:**\n"
|
444 |
"• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
|
445 |
"• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
|
446 |
+
"• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
|
447 |
+
"• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging"
|
448 |
)
|
449 |
|
450 |
gr.Markdown("---")
|
|
|
484 |
choices=AVAILABLE_MODELS,
|
485 |
value="RolmOCR",
|
486 |
label="Choose Model",
|
487 |
+
info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging",
|
488 |
)
|
489 |
|
490 |
submit_button = gr.Button(
|
|
|
556 |
"examples/one/74442232.34.xml",
|
557 |
"olmOCR",
|
558 |
],
|
559 |
+
[
|
560 |
+
"examples/one/74442232.3.jpg",
|
561 |
+
"examples/one/74442232.34.xml",
|
562 |
+
"OCRFlux-3B",
|
563 |
+
],
|
564 |
],
|
565 |
inputs=[image_input, xml_input, model_selector],
|
566 |
outputs=[
|
requirements.txt
CHANGED
@@ -3,7 +3,8 @@ torch
|
|
3 |
gradio
|
4 |
Pillow
|
5 |
lxml
|
6 |
-
transformers
|
|
|
7 |
spaces
|
8 |
torchvision
|
9 |
accelerate
|
|
|
3 |
gradio
|
4 |
Pillow
|
5 |
lxml
|
6 |
+
transformers>=4.49.0
|
7 |
+
qwen-vl-utils
|
8 |
spaces
|
9 |
torchvision
|
10 |
accelerate
|