Spaces:
Running
Running
import difflib
from collections import Counter

import gradio as gr
from transformers import pipeline, AutoTokenizer
# Hub checkpoint shared by the tokenizer and the generation pipeline.
_MODEL_ID = "SuperSl6/Arabic-Text-Correction"

# Load tokenizer (use_fast=False requests the non-fast tokenizer implementation).
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, use_fast=False)

# Text-to-text generation pipeline that produces the corrected sentence.
model = pipeline(
    task="text2text-generation",
    model=_MODEL_ID,
    tokenizer=tokenizer,
)
def align_and_preserve(original, corrected):
    """Merge the model's correction with the original text, word by word.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. The output is assembled from the opcodes:

    * ``equal``   -> the shared words are kept once.
    * ``delete``  -> words present only in ``original`` are preserved.
    * ``insert``  -> words present only in ``corrected`` are kept.
    * ``replace`` -> the corrected words are emitted, followed by the
      original words of that span.

    A word is emitted at most as many times as it occurs in whichever of
    the two texts uses it more often. This suppresses duplicates produced
    by the alignment (e.g. when words are reordered) without deleting
    words that are legitimately repeated in the sentence.

    Args:
        original: The text as entered by the user.
        corrected: The candidate correction to align against ``original``.

    Returns:
        The merged words joined by single spaces; an empty string when
        both inputs contain no words.
    """
    original_words = original.split()
    corrected_words = corrected.split()
    matcher = difflib.SequenceMatcher(None, original_words, corrected_words)

    # Counter union keeps, per word, the larger of the two occurrence counts.
    budget = Counter(original_words) | Counter(corrected_words)
    emitted = Counter()
    final_output = []

    def emit(words):
        # Append each word unless its occurrence budget is already used up.
        for word in words:
            if emitted[word] < budget[word]:
                final_output.append(word)
                emitted[word] += 1

    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'equal':
            emit(corrected_words[b0:b1])
        elif opcode == 'delete':
            emit(original_words[a0:a1])
        elif opcode == 'insert':
            emit(corrected_words[b0:b1])
        elif opcode == 'replace':
            emit(corrected_words[b0:b1])
            # NOTE(review): the replaced original words are kept right after
            # their correction, so a misspelling stays in the output next to
            # its fix. Preserved as-is; confirm this is intended.
            emit(original_words[a0:a1])

    # The opcodes span both sequences completely, so no tail pass is needed
    # (and none is attempted when there are no opcodes at all).
    return ' '.join(final_output)
def extract_corrected_version(original, generated):
    """Pick the generated sentence closest to the input and merge it in.

    ``generated`` is split on the literal separator ``' . '``; the piece
    with the highest character-level ``SequenceMatcher`` ratio against
    ``original`` wins (the first one on ties). That piece is stripped and
    passed to ``align_and_preserve`` together with ``original``.

    Args:
        original: The text as entered by the user.
        generated: Raw text produced by the model.

    Returns:
        The string returned by ``align_and_preserve``.
    """
    def similarity(candidate):
        # Ratio in [0, 1]; higher means closer to the user's text.
        return difflib.SequenceMatcher(None, original, candidate).ratio()

    candidates = generated.split(' . ')
    closest = max(candidates, key=similarity)
    return align_and_preserve(original, closest.strip())
def correct_text(input_text):
    """Run the correction model on ``input_text`` and post-process the result.

    Args:
        input_text: Text typed by the user.

    Returns:
        The corrected text produced by ``extract_corrected_version``, or an
        empty string when ``input_text`` is ``None``, empty or
        whitespace-only (the model is not called in that case).
    """
    # Nothing to correct: skip the generation call entirely.
    if not input_text or not input_text.strip():
        return ""

    # do_sample=True with temperature/top_p means the output may differ
    # between calls for the same input.
    result = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )[0]['generated_text']

    return extract_corrected_version(input_text, result)
# Gradio Interface
# Sample inputs offered to the user (one single-field row per example).
examples = [
    ["اكيد ان لحكام العرب والمسلمين مسؤولية يتمثل ادناها في استدعاء السفراء في الصين للتشاور"],
    ["هزا النص يحتوي على الكثير من الاخطاء الاملائية"],
    ["هليكم السلام ورحمة الله وبركاته"],
    ["انشاء الله سيكون كل شيء بخير"],
]

# Build the widgets first so the Interface call below stays short.
_input_box = gr.Textbox(
    lines=4,
    placeholder="✍️ أدخل النص العربي هنا لتصحيحه...",
    label="📥 النص المدخل",
)
_output_box = gr.Textbox(label="✅ النص المصحح")

interface = gr.Interface(
    fn=correct_text,
    inputs=_input_box,
    outputs=_output_box,
    title="🚀 تصحيح النص العربي باستخدام SuperSl6/Arabic-Text-Correction",
    description="📝 أداة ذكية لتصحيح النصوص العربية باستخدام تقنيات الذكاء الاصطناعي. أدخل النص وسيتم تصحيحه في الوقت الفعلي!",
    theme="compact",
    examples=examples,
    allow_flagging="never",
)

# Start the web app.
interface.launch()