davanstrien (HF Staff) committed
Commit: c7a30f7 · Parent: c80cc87
Files changed (1): app.py (+37 −5)
app.py CHANGED
@@ -3,7 +3,7 @@ from PIL import Image
 import xml.etree.ElementTree as ET
 import os
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
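The new import is needed because allenai/olmOCR-7B-0225-preview is a fine-tune of Qwen2-VL-7B-Instruct. As an aside (an assumption, not something this commit relies on), the `AutoModelForImageTextToText` class already imported on this line should resolve to the same architecture for that checkpoint, so the explicit class mainly pins the expected model type:

    # Sketch, not part of this commit: loading via the Auto class instead.
    import torch
    from transformers import AutoModelForImageTextToText

    model = AutoModelForImageTextToText.from_pretrained(
        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
    )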
@@ -14,7 +14,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
 
 # Load RolmOCR
 try:
@@ -46,6 +46,19 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
 
+# Load olmOCR
+try:
+    PROCESSORS["olmOCR"] = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+    MODELS["olmOCR"] = Qwen2VLForConditionalGeneration.from_pretrained(
+        "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16, device_map="auto"
+    )
+    PIPELINES["olmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["olmOCR"], processor=PROCESSORS["olmOCR"]
+    )
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
+    print(f"Error loading olmOCR: {e}")
+
 
 # --- Helper Functions ---
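Note the split: the processor comes from the Qwen/Qwen2-VL-7B-Instruct base repo while the weights come from allenai/olmOCR-7B-0225-preview, matching the olmOCR model card (the fine-tune reuses the base model's processor). A minimal smoke test for the new registry entry, assuming the globals above loaded on a GPU box and copying the call pattern predict() uses below:

    # Sketch: exercise the newly registered olmOCR pipeline.
    from PIL import Image

    if "olmOCR" in PIPELINES:
        img = Image.new("RGB", (64, 64), "white")  # blank probe image, an assumption
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": "Extract all text from this document image."},
                ],
            }
        ]
        print(PIPELINES["olmOCR"](messages, max_new_tokens=32))
    else:
        print(MODEL_LOAD_ERROR_MSG["olmOCR"])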
@@ -193,7 +206,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-    else:  # Nanonets-OCR-s
+    elif model_name == "Nanonets-OCR-s":
         messages = [
             {
                 "role": "user",
@@ -206,6 +219,19 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
+    else:  # olmOCR
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {
+                        "type": "text",
+                        "text": "Extract all text from this document image, preserving the original reading order and layout structure. Return the plain text representation.",
+                    },
+                ],
+            }
+        ]
     max_tokens = 8096
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
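With a third model, the prompt selection in predict() is now an if/elif/else chain. A table-driven alternative would keep the function flat as more models are added; a sketch (only the olmOCR prompt is quoted from this diff, the other two are elided placeholders):

    # Sketch: prompt lookup instead of chained branches. The "..." entries stand
    # in for the existing RolmOCR and Nanonets prompts, unchanged by this commit.
    PROMPTS = {
        "RolmOCR": "...",
        "Nanonets-OCR-s": "...",
        "olmOCR": (
            "Extract all text from this document image, preserving the original "
            "reading order and layout structure. Return the plain text representation."
        ),
    }

    def build_messages(pil_image, model_name):
        return [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": pil_image},
                    {"type": "text", "text": PROMPTS[model_name]},
                ],
            }
        ]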
@@ -354,7 +380,8 @@ with gr.Blocks() as demo:
         "Upload a historical document image and its XML file to compare these approaches side-by-side. "
         "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
         "**Available models:** [RolmOCR](https://huggingface.co/reducto/RolmOCR) | "
-        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)"
+        "[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) | "
+        "[olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview)"
     )
 
     gr.Markdown("---")
@@ -394,7 +421,7 @@ with gr.Blocks() as demo:
         choices=AVAILABLE_MODELS,
         value="RolmOCR",
         label="Choose Model",
-        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support",
+        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
     )
 
     submit_button = gr.Button(
@@ -461,6 +488,11 @@ with gr.Blocks() as demo:
             "examples/one/74442232.34.xml",
             "Nanonets-OCR-s",
         ],
+        [
+            "examples/one/74442232.3.jpg",
+            "examples/one/74442232.34.xml",
+            "olmOCR",
+        ],
     ],
     inputs=[image_input, xml_input, model_selector],
     outputs=[
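The new example row reuses the same image/XML pair as the Nanonets row, so all three models can be compared on one document. End to end, the addition can be checked against the bundled assets (a sketch; on the Space, predict() may be wrapped by a ZeroGPU decorator via the spaces import above):

    # Sketch: run the newly added model over the bundled example document.
    from PIL import Image

    img = Image.open("examples/one/74442232.3.jpg")
    print(predict(img, model_name="olmOCR"))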