Spaces:

SuperSl6
/

Arabic-Text-Correction

Sleeping

App Files Files Community

SuperSl6 committed on Feb 3

Commit

05cb4d5

verified ·

1 Parent(s): 416370c

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -5

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from transformers import pipeline, AutoTokenizer
 import gradio as gr
 import re
 # Load tokenizer with use_fast=False
 tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
@@ -10,19 +11,49 @@ model = pipeline(
     tokenizer=tokenizer
 )
 def correct_text(input_text):
     result = model(
         input_text,
         max_length=50,
         no_repeat_ngram_size=2,
         repetition_penalty=1.5,
-        num_return_sequences=1
     )[0]['generated_text']
-    # Extract the first occurrence of corrected Arabic word(s)
-    matches = re.findall(r'[\u0600-\u06FF]+', result)
-    corrected_text = matches[0] if matches else result
     return corrected_text
 # Gradio Interface

 from transformers import pipeline, AutoTokenizer
 import gradio as gr
 import re
+import difflib
 # Load tokenizer with use_fast=False
 tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
     tokenizer=tokenizer
 )
# Matches one maximal run of characters from the Unicode Arabic block
# (U+0600-U+06FF). Compiled once so each call reuses the same pattern.
_ARABIC_RUN = re.compile(r'[\u0600-\u06FF]+')


def extract_corrected_version(original: str, generated: str) -> str:
    """Pick the corrected Arabic text out of raw generated output.

    ``generated`` is split on the literal separator ``' . '`` and the piece
    with the highest ``difflib.SequenceMatcher`` ratio against ``original``
    is kept (the first such piece wins on ties). All runs of Arabic-block
    characters in that piece are then joined with single spaces.

    Args:
        original: The text that was submitted for correction.
        generated: The raw text produced for ``original``.

    Returns:
        The space-joined Arabic runs when that joined string occurs verbatim
        in the selected piece; otherwise ``original`` unchanged. ``original``
        is also returned when the selected piece contains no Arabic
        characters at all.
    """
    sentences = generated.split(' . ')
    best_match = max(
        sentences,
        key=lambda s: difflib.SequenceMatcher(None, original, s).ratio(),
    )

    corrected_words = _ARABIC_RUN.findall(best_match)
    if not corrected_words:
        # Nothing Arabic in the best candidate: fall back to the input.
        return original

    corrected_text = ' '.join(corrected_words)
    # The joined string is only trusted when it appears verbatim in the
    # candidate, i.e. the Arabic runs were separated by exactly one space.
    # An earlier version followed this check with a loop over word prefixes,
    # but that loop always returned on its first iteration (the full phrase),
    # so this single test is the complete decision.
    if corrected_text in best_match:
        return corrected_text
    return original


def correct_text(input_text):
    """Run the correction pipeline on ``input_text`` and return the result.

    Blank input (``None``, empty, or whitespace-only) is returned without
    calling the model: ``None`` becomes ``""`` and any other blank string is
    returned as-is. Otherwise the module-level ``model`` pipeline generates
    text with sampling enabled, and ``extract_corrected_version`` reduces
    that text to the corrected phrase.

    Args:
        input_text: Text to correct.

    Returns:
        The corrected text, or the input itself when no correction could be
        extracted or the input was blank.
    """
    if not input_text or not input_text.strip():
        # There is nothing to correct, so skip generation entirely.
        return input_text or ""

    result = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        # Sampling is enabled, so output may differ between calls.
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )[0]['generated_text']

    return extract_corrected_version(input_text, result)
 # Gradio Interface