MohamedRashad committed on
Commit
5d256ae
·
1 Parent(s): 34a5af9

Enhance is_arabic_text function: refine detection to focus on headers and paragraphs, excluding lists and code blocks

Browse files
Files changed (1) hide show
  1. app.py +31 -2
app.py CHANGED
@@ -5,6 +5,7 @@ import os
5
  import traceback
6
  from io import BytesIO
7
  from typing import Any, Dict, List, Optional, Tuple
 
8
 
9
  import fitz # PyMuPDF
10
  import gradio as gr
@@ -186,15 +187,43 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
186
 
187
 
188
  def is_arabic_text(text: str) -> bool:
189
- """Check if text contains mostly Arabic characters"""
190
  if not text:
191
  return False
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # Arabic Unicode ranges
194
  arabic_chars = 0
195
  total_chars = 0
196
 
197
- for char in text:
198
  if char.isalpha():
199
  total_chars += 1
200
  # Arabic script ranges
 
5
  import traceback
6
  from io import BytesIO
7
  from typing import Any, Dict, List, Optional, Tuple
8
+ import re
9
 
10
  import fitz # PyMuPDF
11
  import gradio as gr
 
187
 
188
 
189
  def is_arabic_text(text: str) -> bool:
190
+ """Check if text in headers and paragraphs contains mostly Arabic characters"""
191
  if not text:
192
  return False
193
 
194
+ # Extract text from headers and paragraphs only
195
+ # Match markdown headers (# ## ###) and regular paragraph text
196
+ header_pattern = r'^#{1,6}\s+(.+)$'
197
+ paragraph_pattern = r'^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$'
198
+
199
+ content_text = []
200
+
201
+ for line in text.split('\n'):
202
+ line = line.strip()
203
+ if not line:
204
+ continue
205
+
206
+ # Check for headers
207
+ header_match = re.match(header_pattern, line, re.MULTILINE)
208
+ if header_match:
209
+ content_text.append(header_match.group(1))
210
+ continue
211
+
212
+ # Check for paragraph text (exclude lists, tables, code blocks, images)
213
+ if re.match(paragraph_pattern, line, re.MULTILINE):
214
+ content_text.append(line)
215
+
216
+ if not content_text:
217
+ return False
218
+
219
+ # Join all content text and check for Arabic characters
220
+ combined_text = ' '.join(content_text)
221
+
222
  # Arabic Unicode ranges
223
  arabic_chars = 0
224
  total_chars = 0
225
 
226
+ for char in combined_text:
227
  if char.isalpha():
228
  total_chars += 1
229
  # Arabic script ranges