MohamedRashad committed on
Commit
5f82e6a
·
1 Parent(s): 77be916

Add Arabic text detection functionality and update UI for multilingual support

Browse files
Files changed (1) hide show
  1. app.py +61 -6
app.py CHANGED
@@ -185,6 +185,29 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
185
  return img_copy
186
 
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
189
  """Convert layout JSON to markdown format"""
190
  markdown_lines = []
@@ -494,7 +517,13 @@ def turn_page(direction: str) -> Tuple[Optional[Image.Image], str, str]:
494
  else:
495
  current_result = "Page not processed yet"
496
 
497
- return current_image, page_info, current_result
 
 
 
 
 
 
498
 
499
 
500
  def create_gradio_interface():
@@ -573,9 +602,22 @@ def create_gradio_interface():
573
 
574
  # Header
575
  gr.HTML("""
576
- <div class="header-text">
577
- <h1>🔍 Dots.OCR Hugging Face Demo</h1>
578
- <p>Advanced OCR and Document Layout Analysis powered by Hugging Face Transformers</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  </div>
580
  """)
581
 
@@ -735,9 +777,15 @@ def create_gradio_interface():
735
  first_result = all_results[0]
736
  combined_markdown = "\n\n---\n\n".join(all_markdown)
737
 
 
 
 
 
 
 
738
  return (
739
  first_result['processed_image'],
740
- combined_markdown,
741
  first_result['raw_output'],
742
  first_result['layout_result'],
743
  '<div class="model-status status-ready">✅ Processing completed!</div>'
@@ -753,9 +801,16 @@ def create_gradio_interface():
753
  pdf_cache["results"] = [result]
754
  pdf_cache["is_parsed"] = True
755
 
 
 
 
 
 
 
 
756
  return (
757
  result['processed_image'],
758
- result['markdown_content'] or "No content extracted",
759
  result['raw_output'],
760
  result['layout_result'],
761
  '<div class="model-status status-ready">✅ Processing completed!</div>'
 
185
  return img_copy
186
 
187
 
188
def is_arabic_text(text: str) -> bool:
    """Check if text contains mostly Arabic characters.

    Only alphabetic characters (``str.isalpha()``) are counted, so digits,
    punctuation, whitespace and markdown symbols do not affect the ratio.

    Args:
        text: The string to inspect. May be empty.

    Returns:
        True if strictly more than 50% of the alphabetic characters belong
        to an Arabic-script Unicode block; False otherwise, including for
        empty input or input with no alphabetic characters.
    """
    if not text:
        return False

    # Arabic-script blocks in the Basic Multilingual Plane. The presentation
    # form blocks are included because extracted text may contain
    # pre-shaped (contextual) glyph code points rather than the base letters;
    # those are alphabetic and would otherwise count against the Arabic ratio.
    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u0870', '\u08FF'),  # Arabic Extended-B and Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFF'),  # Arabic Presentation Forms-B
    )

    arabic_chars = 0
    total_chars = 0

    for char in text:
        if not char.isalpha():
            continue
        total_chars += 1
        if any(low <= char <= high for low, high in arabic_ranges):
            arabic_chars += 1

    # No letters at all (e.g. only digits or punctuation): not Arabic.
    if total_chars == 0:
        return False

    # Consider text as Arabic if more than 50% of alphabetic characters are Arabic
    return (arabic_chars / total_chars) > 0.5
209
+
210
+
211
  def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
212
  """Convert layout JSON to markdown format"""
213
  markdown_lines = []
 
517
  else:
518
  current_result = "Page not processed yet"
519
 
520
+ # Check if the result contains mostly Arabic text and return appropriate update
521
+ if is_arabic_text(current_result):
522
+ result_update = gr.update(value=current_result, rtl=True)
523
+ else:
524
+ result_update = current_result
525
+
526
+ return current_image, page_info, result_update
527
 
528
 
529
  def create_gradio_interface():
 
602
 
603
  # Header
604
  gr.HTML("""
605
+ <div class="title" style="text-align: center">
606
+ <h1>🔍 Dot-OCR - Multilingual Document Text Extraction</h1>
607
+ <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
608
+ A state-of-the-art image/pdf-to-markdown vision language model for intelligent document processing
609
+ </p>
610
+ <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
611
+ <a href="https://huggingface.co/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
612
+ 📚 Hugging Face Model
613
+ </a>
614
+ <a href="https://github.com/rednote-hilab/dots.ocr/blob/master/assets/blog.md" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
615
+ 📝 Release Blog
616
+ </a>
617
+ <a href="https://github.com/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
618
+ 💻 GitHub Repository
619
+ </a>
620
+ </div>
621
  </div>
622
  """)
623
 
 
777
  first_result = all_results[0]
778
  combined_markdown = "\n\n---\n\n".join(all_markdown)
779
 
780
+ # Check if the combined markdown contains mostly Arabic text
781
+ if is_arabic_text(combined_markdown):
782
+ markdown_update = gr.update(value=combined_markdown, rtl=True)
783
+ else:
784
+ markdown_update = combined_markdown
785
+
786
  return (
787
  first_result['processed_image'],
788
+ markdown_update,
789
  first_result['raw_output'],
790
  first_result['layout_result'],
791
  '<div class="model-status status-ready">✅ Processing completed!</div>'
 
801
  pdf_cache["results"] = [result]
802
  pdf_cache["is_parsed"] = True
803
 
804
+ # Check if the content contains mostly Arabic text
805
+ content = result['markdown_content'] or "No content extracted"
806
+ if is_arabic_text(content):
807
+ markdown_update = gr.update(value=content, rtl=True)
808
+ else:
809
+ markdown_update = content
810
+
811
  return (
812
  result['processed_image'],
813
+ markdown_update,
814
  result['raw_output'],
815
  result['layout_result'],
816
  '<div class="model-status status-ready">✅ Processing completed!</div>'