davanstrien (HF Staff) committed
Commit 4386729 · 1 Parent(s): 4af31e3

try adding flux...

Files changed (2):
  1. app.py +40 -6
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
 import os
 import torch
 import json
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # turn on HF_TRANSFER
@@ -15,7 +15,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B"]
 
 # Load RolmOCR
 try:
@@ -60,6 +60,21 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["olmOCR"] = f"Failed to load olmOCR: {str(e)}"
     print(f"Error loading olmOCR: {e}")
 
+# Load OCRFlux-3B
+try:
+    PROCESSORS["OCRFlux-3B"] = AutoProcessor.from_pretrained("ChatDOC/OCRFlux-3B")
+    MODELS["OCRFlux-3B"] = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        "ChatDOC/OCRFlux-3B", torch_dtype=torch.bfloat16, device_map="auto"
+    )
+    PIPELINES["OCRFlux-3B"] = pipeline(
+        "image-text-to-text",
+        model=MODELS["OCRFlux-3B"],
+        processor=PROCESSORS["OCRFlux-3B"]
+    )
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
+    print(f"Error loading OCRFlux-3B: {e}")
+
 
 # --- Helper Functions ---
 
@@ -220,7 +235,7 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
-    else: # olmOCR
+    elif model_name == "olmOCR":
         messages = [
             {
                 "role": "user",
@@ -233,6 +248,19 @@ def predict(pil_image, model_name="RolmOCR"):
                 ],
             }
         ]
+    else: # OCRFlux-3B
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {
+                        "type": "text",
+                        "text": "Convert this document page to clean, readable markdown format. Preserve all text content, maintain the original reading order, convert tables to markdown table format, and include any mathematical equations in LaTeX format.",
+                    },
+                ],
+            }
+        ]
     max_tokens = 8096
     # Use the pipeline with the properly formatted messages
     return selected_pipe(messages, max_new_tokens=max_tokens)
@@ -409,13 +437,14 @@ with gr.Blocks() as demo:
         "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
         "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
         "Now, Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner output. "
-        "This Space lets you compare three leading VLM-based OCR models against traditional approaches. "
+        "This Space lets you compare four leading VLM-based OCR models against traditional approaches. "
         "Upload a historical document image and its XML file to see them side-by-side. "
         "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content.\n\n"
         "**Available models:**\n"
         "• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
         "• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
-        "• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist"
+        "• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
+        "• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging"
     )
 
     gr.Markdown("---")
@@ -455,7 +484,7 @@ with gr.Blocks() as demo:
         choices=AVAILABLE_MODELS,
         value="RolmOCR",
         label="Choose Model",
-        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents",
+        info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging",
     )
 
     submit_button = gr.Button(
@@ -527,6 +556,11 @@ with gr.Blocks() as demo:
                 "examples/one/74442232.34.xml",
                 "olmOCR",
             ],
+            [
+                "examples/one/74442232.3.jpg",
+                "examples/one/74442232.34.xml",
+                "OCRFlux-3B",
+            ],
         ],
         inputs=[image_input, xml_input, model_selector],
         outputs=[
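
For context, the new OCRFlux-3B code path can be exercised outside the Gradio app roughly as follows. This is a minimal sketch, not part of the commit: it reuses the ChatDOC/OCRFlux-3B checkpoint, the image-text-to-text pipeline, and the example image path from the diff above, assumes a GPU with bfloat16 support, and the variable names are illustrative.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, pipeline

# Load the OCRFlux-3B checkpoint the same way the new try/except block in app.py does.
processor = AutoProcessor.from_pretrained("ChatDOC/OCRFlux-3B")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "ChatDOC/OCRFlux-3B", torch_dtype=torch.bfloat16, device_map="auto"
)
ocr_pipe = pipeline("image-text-to-text", model=model, processor=processor)

# Build the chat-style messages that predict() constructs for the OCRFlux-3B branch.
image = Image.open("examples/one/74442232.3.jpg")  # example page from the Space's examples
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {
                "type": "text",
                "text": "Convert this document page to clean, readable markdown format.",
            },
        ],
    }
]

output = ocr_pipe(messages, max_new_tokens=8096)
print(output)  # same structure that predict() returns in app.py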
requirements.txt CHANGED
@@ -3,7 +3,8 @@ torch
 gradio
 Pillow
 lxml
-transformers
+transformers>=4.49.0
+qwen-vl-utils
 spaces
 torchvision
 accelerate
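
On the requirements side, the raised transformers floor is presumably what provides the Qwen2_5_VLForConditionalGeneration class that OCRFlux-3B is loaded through, with qwen-vl-utils as the companion preprocessing package for Qwen-VL-family models. A quick post-install sanity check, as a sketch:

import transformers

# This import only succeeds on releases that ship Qwen2.5-VL support;
# older versions raise ImportError.
from transformers import Qwen2_5_VLForConditionalGeneration

print(transformers.__version__)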