Spaces:

SuperSl6
/

Arabic-Text-Correction

Running

App Files Community

SuperSl6 committed on Feb 3

Commit

0c64ab6

verified ·

1 Parent(s): 5bd61d5

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -32

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 from transformers import pipeline, AutoTokenizer
 import gradio as gr
-import re
 import difflib
-# Load tokenizer with use_fast=False
 tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
 model = pipeline(
     "text2text-generation",
@@ -11,34 +10,47 @@ model = pipeline(
     tokenizer=tokenizer
 )
-def extract_corrected_version(original, generated):
-    # Split generated text into sentences
-    sentences = generated.split(' . ')
-    # Find the sentence most similar to the original
-    best_match = max(sentences, key=lambda s: difflib.SequenceMatcher(None, original, s).ratio())
-    # Extract the corrected Arabic words
-    corrected_words = re.findall(r'[\u0600-\u06FF]+', best_match)
-    # If no corrections found, return the original input
-    if not corrected_words:
-        return original
-    # Check if the corrected text is a proper subset of the generated text
-    corrected_text = ' '.join(corrected_words)
-    if corrected_text in best_match:
-        # Check if the corrected text is the complete output
-        if corrected_text == best_match.strip():
-            return corrected_text
-        else:
-            # If not the complete output, find the shortest corrected phrase
-            for i in range(len(corrected_words), 0, -1):
-                phrase = ' '.join(corrected_words[:i])
-                if phrase in best_match:
-                    return phrase
-    # If no corrected phrase is found, return the original input
-    return original
 def correct_text(input_text):
     result = model(
@@ -52,18 +64,27 @@ def correct_text(input_text):
         do_sample=True
     )[0]['generated_text']
-    # Extract the corrected version
     corrected_text = extract_corrected_version(input_text, result)
     return corrected_text
 # Gradio Interface
 interface = gr.Interface(
     fn=correct_text,
-    inputs=gr.Textbox(lines=3, placeholder="أدخل النص العربي هنا..."),
-    outputs=gr.Textbox(),
     live=True,
-    title="تصحيح النص العربي",
-    description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction."
 )
-interface.launch()

 from transformers import pipeline, AutoTokenizer
 import gradio as gr
 import difflib
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
 model = pipeline(
     "text2text-generation",
     tokenizer=tokenizer
 )
def align_and_preserve(original, corrected):
    """Merge the words of *original* and *corrected* into one de-duplicated string.

    Both texts are split on whitespace and aligned word-by-word with
    ``difflib.SequenceMatcher``. The result is built from the alignment
    opcodes as follows:

    * ``equal``   -- the shared words are kept.
    * ``delete``  -- words present only in *original* are kept.
    * ``replace`` -- the corrected words are emitted first, followed by the
      original words they replaced.
    * ``insert``  -- words present only in *corrected* are dropped.

    Every word is emitted at most once: a word that has already been written
    to the output is skipped on any later occurrence.

    Args:
        original: The user-supplied text.
        corrected: The candidate correction to align against *original*.

    Returns:
        The selected words joined by single spaces, or an empty string when
        neither input contains any word.
    """
    original_words = original.split()
    corrected_words = corrected.split()
    matcher = difflib.SequenceMatcher(None, original_words, corrected_words)

    final_output = []
    seen_words = set()

    def keep_unseen(words):
        # Append each word the first time it is encountered, preserving order.
        for word in words:
            if word not in seen_words:
                seen_words.add(word)
                final_output.append(word)

    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'equal':
            keep_unseen(corrected_words[b0:b1])
        elif opcode == 'delete':
            keep_unseen(original_words[a0:a1])
        elif opcode == 'replace':
            # NOTE(review): both sides of a replacement end up in the output
            # (corrected first, then original) -- confirm this is intended.
            keep_unseen(corrected_words[b0:b1])
            keep_unseen(original_words[a0:a1])
        # 'insert' opcodes are skipped, so words that appear only in the
        # corrected text never reach the output.

    # The previous version finished with a loop over corrected_words[b1:],
    # reusing the loop variable after the loop. get_opcodes() always spans
    # both sequences, so that slice was always empty, and when both inputs
    # were empty (no opcodes at all) b1 was unbound and the function raised
    # UnboundLocalError. The loop is therefore removed.
    return ' '.join(final_output)
def extract_corrected_version(original, generated):
    """Pick the part of *generated* closest to *original* and merge it in.

    *generated* is split on the literal separator ``' . '``. The piece with
    the highest character-level ``difflib.SequenceMatcher`` ratio against
    *original* is selected, stripped of surrounding whitespace, and passed
    together with *original* to ``align_and_preserve``.

    Args:
        original: The text the user entered.
        generated: The raw text produced by the model.

    Returns:
        The string returned by ``align_and_preserve`` for the selected piece.
    """
    def similarity(candidate):
        # Character-level similarity between the input and one candidate.
        return difflib.SequenceMatcher(None, original, candidate).ratio()

    candidates = generated.split(' . ')
    closest = max(candidates, key=similarity)
    return align_and_preserve(original, closest.strip())
 def correct_text(input_text):
     result = model(
         do_sample=True
     )[0]['generated_text']
     corrected_text = extract_corrected_version(input_text, result)
     return corrected_text
 # Gradio Interface
+examples = [
+    ["اكيد ان لحكام العرب والمسلمين مسؤولية يتمثل ادناها في استدعاء السفراء في الصين للتشاور"],
+    ["هزا النص يحتوي على الكثير من الاخطاء الاملائية"],
+    ["هليكم السلام ورحمة الله وبركاته"],
+    ["انشاء الله سيكون كل شيء بخير"]
+]
 interface = gr.Interface(
     fn=correct_text,
+    inputs=gr.Textbox(lines=4, placeholder="✍️ أدخل النص العربي هنا لتصحيحه...", label="📥 النص المدخل"),
+    outputs=gr.Textbox(label="✅ النص المصحح"),
     live=True,
+    title="🚀 تصحيح النص العربي باستخدام SuperSl6/Arabic-Text-Correction",
+    description="📝 أداة ذكية لتصحيح النصوص العربية باستخدام تقنيات الذكاء الاصطناعي. أدخل النص وسيتم تصحيحه في الوقت الفعلي!",
+    theme="compact",
+    examples=examples,
+    allow_flagging="never"
 )
+interface.launch()