Commit
·
5f82e6a
1
Parent(s):
77be916
Add Arabic text detection functionality and update UI for multilingual support
Browse files
app.py
CHANGED
@@ -185,6 +185,29 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
|
|
185 |
return img_copy
|
186 |
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
|
189 |
"""Convert layout JSON to markdown format"""
|
190 |
markdown_lines = []
|
@@ -494,7 +517,13 @@ def turn_page(direction: str) -> Tuple[Optional[Image.Image], str, str]:
|
|
494 |
else:
|
495 |
current_result = "Page not processed yet"
|
496 |
|
497 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
|
499 |
|
500 |
def create_gradio_interface():
|
@@ -573,9 +602,22 @@ def create_gradio_interface():
|
|
573 |
|
574 |
# Header
|
575 |
gr.HTML("""
|
576 |
-
<div class="
|
577 |
-
<h1>π
|
578 |
-
<p
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
</div>
|
580 |
""")
|
581 |
|
@@ -735,9 +777,15 @@ def create_gradio_interface():
|
|
735 |
first_result = all_results[0]
|
736 |
combined_markdown = "\n\n---\n\n".join(all_markdown)
|
737 |
|
|
|
|
|
|
|
|
|
|
|
|
|
738 |
return (
|
739 |
first_result['processed_image'],
|
740 |
-
|
741 |
first_result['raw_output'],
|
742 |
first_result['layout_result'],
|
743 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
@@ -753,9 +801,16 @@ def create_gradio_interface():
|
|
753 |
pdf_cache["results"] = [result]
|
754 |
pdf_cache["is_parsed"] = True
|
755 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
756 |
return (
|
757 |
result['processed_image'],
|
758 |
-
|
759 |
result['raw_output'],
|
760 |
result['layout_result'],
|
761 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
|
|
185 |
return img_copy
|
186 |
|
187 |
|
188 |
+
def is_arabic_text(text: str) -> bool:
    """Return True when most alphabetic characters in *text* are Arabic script.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, punctuation, whitespace and combining marks do not influence
    the ratio.

    Args:
        text: The string to inspect. May be empty.

    Returns:
        True if strictly more than 50% of the alphabetic characters fall in
        an Arabic Unicode block; False otherwise, including when *text* is
        empty or contains no alphabetic characters at all.
    """
    if not text:
        return False

    # Inclusive code-point ranges of the Arabic-script Unicode blocks.
    # The presentation-form blocks matter because text extracted from PDFs
    # or produced by OCR frequently uses these pre-shaped glyph code points
    # instead of the base letters in U+0600-06FF.
    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u0870', '\u08FF'),  # Arabic Extended-B and Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFF'),  # Arabic Presentation Forms-B
    )

    arabic_chars = 0
    total_chars = 0

    for char in text:
        if not char.isalpha():
            continue
        total_chars += 1
        if any(low <= char <= high for low, high in arabic_ranges):
            arabic_chars += 1

    # No letters at all (e.g. only digits or punctuation): not Arabic.
    if total_chars == 0:
        return False

    # Consider text as Arabic if more than 50% of alphabetic characters are Arabic
    return (arabic_chars / total_chars) > 0.5
|
209 |
+
|
210 |
+
|
211 |
def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
|
212 |
"""Convert layout JSON to markdown format"""
|
213 |
markdown_lines = []
|
|
|
517 |
else:
|
518 |
current_result = "Page not processed yet"
|
519 |
|
520 |
+
# Check if the result contains mostly Arabic text and return appropriate update
|
521 |
+
if is_arabic_text(current_result):
|
522 |
+
result_update = gr.update(value=current_result, rtl=True)
|
523 |
+
else:
|
524 |
+
result_update = current_result
|
525 |
+
|
526 |
+
return current_image, page_info, result_update
|
527 |
|
528 |
|
529 |
def create_gradio_interface():
|
|
|
602 |
|
603 |
# Header
|
604 |
gr.HTML("""
|
605 |
+
<div class="title" style="text-align: center">
|
606 |
+
<h1>🔍 Dot-OCR - Multilingual Document Text Extraction</h1>
|
607 |
+
<p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
|
608 |
+
A state-of-the-art image/pdf-to-markdown vision language model for intelligent document processing
|
609 |
+
</p>
|
610 |
+
<div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
|
611 |
+
<a href="https://huggingface.co/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
612 |
+
π Hugging Face Model
|
613 |
+
</a>
|
614 |
+
<a href="https://github.com/rednote-hilab/dots.ocr/blob/master/assets/blog.md" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
615 |
+
π Release Blog
|
616 |
+
</a>
|
617 |
+
<a href="https://github.com/rednote-hilab/dots.ocr" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
|
618 |
+
💻 GitHub Repository
|
619 |
+
</a>
|
620 |
+
</div>
|
621 |
</div>
|
622 |
""")
|
623 |
|
|
|
777 |
first_result = all_results[0]
|
778 |
combined_markdown = "\n\n---\n\n".join(all_markdown)
|
779 |
|
780 |
+
# Check if the combined markdown contains mostly Arabic text
|
781 |
+
if is_arabic_text(combined_markdown):
|
782 |
+
markdown_update = gr.update(value=combined_markdown, rtl=True)
|
783 |
+
else:
|
784 |
+
markdown_update = combined_markdown
|
785 |
+
|
786 |
return (
|
787 |
first_result['processed_image'],
|
788 |
+
markdown_update,
|
789 |
first_result['raw_output'],
|
790 |
first_result['layout_result'],
|
791 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|
|
|
801 |
pdf_cache["results"] = [result]
|
802 |
pdf_cache["is_parsed"] = True
|
803 |
|
804 |
+
# Check if the content contains mostly Arabic text
|
805 |
+
content = result['markdown_content'] or "No content extracted"
|
806 |
+
if is_arabic_text(content):
|
807 |
+
markdown_update = gr.update(value=content, rtl=True)
|
808 |
+
else:
|
809 |
+
markdown_update = content
|
810 |
+
|
811 |
return (
|
812 |
result['processed_image'],
|
813 |
+
markdown_update,
|
814 |
result['raw_output'],
|
815 |
result['layout_result'],
|
816 |
'<div class="model-status status-ready">✅ Processing completed!</div>'
|