"""Gradio app that corrects Arabic text with SuperSl6/Arabic-Text-Correction."""

import difflib
import re

import gradio as gr
from transformers import AutoTokenizer, pipeline

# Hugging Face model id, used for both the tokenizer and the pipeline.
MODEL_NAME = "SuperSl6/Arabic-Text-Correction"

# One or more consecutive characters from the Unicode Arabic block.
_ARABIC_RUN = re.compile(r'[\u0600-\u06FF]+')

# Load tokenizer with use_fast=False
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

model = pipeline(
    "text2text-generation",
    model=MODEL_NAME,
    tokenizer=tokenizer,
)


def extract_corrected_version(original: str, generated: str) -> str:
    """Pick the corrected sentence out of the model's raw output.

    The generated text is split on ``' . '`` and the piece most similar to
    *original* (by ``difflib.SequenceMatcher`` ratio) is selected. The runs
    of Arabic-block characters in that piece are joined with single spaces.

    Args:
        original: The text the user submitted.
        generated: The raw text produced by the model.

    Returns:
        The space-joined Arabic runs of the best-matching piece when that
        joined text occurs verbatim inside the piece; otherwise *original*
        (no Arabic runs at all, or the runs are separated by something
        other than a single space).
    """
    # str.split always yields at least one element, so max() cannot fail.
    sentences = generated.split(' . ')
    best_match = max(
        sentences,
        key=lambda s: difflib.SequenceMatcher(None, original, s).ratio(),
    )

    corrected_words = _ARABIC_RUN.findall(best_match)
    # Without this guard the empty join below would be "in" every string
    # and an empty correction would be returned.
    if not corrected_words:
        return original

    corrected_text = ' '.join(corrected_words)
    # The joined text is trusted only when it appears contiguously in the
    # model output. The previous prefix-shrinking loop always returned on
    # its first iteration with this same value, so it has been folded away.
    if corrected_text in best_match:
        return corrected_text

    return original


def correct_text(input_text: str) -> str:
    """Run the correction model on *input_text* and return the cleaned result.

    Args:
        input_text: Text taken from the Gradio textbox.

    Returns:
        The corrected text as chosen by ``extract_corrected_version``, or an
        empty string when the input is empty or only whitespace.
    """
    # The interface is live, so this is also called when the textbox is
    # cleared; skip the model instead of letting it sample from nothing.
    if not input_text or not input_text.strip():
        return ""

    result = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,  # sampling: the same input may yield different output
    )[0]['generated_text']

    # Extract the corrected version
    return extract_corrected_version(input_text, result)


# Gradio Interface
interface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=3, placeholder="أدخل النص العربي هنا..."),
    outputs=gr.Textbox(),
    live=True,
    title="تصحيح النص العربي",
    description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction."
)

interface.launch()