Commit
·
5d256ae
1
Parent(s):
34a5af9
Enhance is_arabic_text function: refine detection to focus on headers and paragraphs, excluding lists and code blocks
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import os
|
|
5 |
import traceback
|
6 |
from io import BytesIO
|
7 |
from typing import Any, Dict, List, Optional, Tuple
|
|
|
8 |
|
9 |
import fitz # PyMuPDF
|
10 |
import gradio as gr
|
@@ -186,15 +187,43 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
|
|
186 |
|
187 |
|
188 |
def is_arabic_text(text: str) -> bool:
|
189 |
-
"""Check if text contains mostly Arabic characters"""
|
190 |
if not text:
|
191 |
return False
|
192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
# Arabic Unicode ranges
|
194 |
arabic_chars = 0
|
195 |
total_chars = 0
|
196 |
|
197 |
-
for char in
|
198 |
if char.isalpha():
|
199 |
total_chars += 1
|
200 |
# Arabic script ranges
|
|
|
5 |
import traceback
|
6 |
from io import BytesIO
|
7 |
from typing import Any, Dict, List, Optional, Tuple
|
8 |
+
import re
|
9 |
|
10 |
import fitz # PyMuPDF
|
11 |
import gradio as gr
|
|
|
187 |
|
188 |
|
189 |
def is_arabic_text(text: str) -> bool:
|
190 |
+
"""Check if text in headers and paragraphs contains mostly Arabic characters"""
|
191 |
if not text:
|
192 |
return False
|
193 |
|
194 |
+
# Extract text from headers and paragraphs only
|
195 |
+
# Match markdown headers (# ## ###) and regular paragraph text
|
196 |
+
header_pattern = r'^#{1,6}\s+(.+)$'
|
197 |
+
paragraph_pattern = r'^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$'
|
198 |
+
|
199 |
+
content_text = []
|
200 |
+
|
201 |
+
for line in text.split('\n'):
|
202 |
+
line = line.strip()
|
203 |
+
if not line:
|
204 |
+
continue
|
205 |
+
|
206 |
+
# Check for headers
|
207 |
+
header_match = re.match(header_pattern, line, re.MULTILINE)
|
208 |
+
if header_match:
|
209 |
+
content_text.append(header_match.group(1))
|
210 |
+
continue
|
211 |
+
|
212 |
+
# Check for paragraph text (exclude lists, tables, code blocks, images)
|
213 |
+
if re.match(paragraph_pattern, line, re.MULTILINE):
|
214 |
+
content_text.append(line)
|
215 |
+
|
216 |
+
if not content_text:
|
217 |
+
return False
|
218 |
+
|
219 |
+
# Join all content text and check for Arabic characters
|
220 |
+
combined_text = ' '.join(content_text)
|
221 |
+
|
222 |
# Arabic Unicode ranges
|
223 |
arabic_chars = 0
|
224 |
total_chars = 0
|
225 |
|
226 |
+
for char in combined_text:
|
227 |
if char.isalpha():
|
228 |
total_chars += 1
|
229 |
# Arabic script ranges
|