# Spaces: Sleeping
from transformers import pipeline, AutoTokenizer | |
import gradio as gr | |
import re | |
import difflib | |
# Checkpoint identifier shared by the tokenizer and the generation pipeline.
MODEL_ID = "SuperSl6/Arabic-Text-Correction"

# The tokenizer is loaded separately so use_fast=False can be passed explicitly.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Text-to-text generation pipeline that reuses the tokenizer loaded above.
model = pipeline(
    task="text2text-generation",
    model=MODEL_ID,
    tokenizer=tokenizer,
)
def extract_corrected_version(original: str, generated: str) -> str:
    """Pick the corrected Arabic text out of the model's raw output.

    The generated text is split on the literal separator ``' . '`` and the
    piece most similar to ``original`` (by ``difflib.SequenceMatcher`` ratio)
    is kept. Runs of characters in the range U+0600-U+06FF are then pulled
    from that piece and joined with single spaces.

    Args:
        original: The text that was submitted for correction.
        generated: The raw string produced by the model.

    Returns:
        The space-joined Arabic words when they occur as one contiguous
        substring of the best-matching piece. Otherwise ``original`` is
        returned unchanged: either no Arabic characters were found, or the
        words are separated by something other than a single space.
    """
    # Split generated text into candidate sentences.
    candidates = generated.split(' . ')

    # Keep the candidate closest to the user's input; ties go to the first.
    best_match = max(
        candidates,
        key=lambda s: difflib.SequenceMatcher(None, original, s).ratio(),
    )

    # Extract the Arabic-script runs, dropping everything else.
    corrected_words = re.findall(r'[\u0600-\u06FF]+', best_match)
    if not corrected_words:
        return original

    corrected_text = ' '.join(corrected_words)

    # A prefix-by-prefix search is unnecessary here: when the full join is a
    # substring of the candidate it is already the longest matching prefix,
    # so a single membership test decides the result.
    if corrected_text in best_match:
        return corrected_text

    # The words are not contiguous in the candidate; fall back to the input.
    return original
def correct_text(input_text):
    """Run the correction model on ``input_text`` and return the cleaned result.

    Args:
        input_text: Text to correct, as delivered by the UI callback.

    Returns:
        An empty string when ``input_text`` is empty, ``None`` or only
        whitespace (the model is not called). Otherwise the value produced by
        ``extract_corrected_version`` for the first generated sequence.
    """
    # The UI can fire this callback with a blank textbox; sampling from the
    # model on empty input only yields unrelated text, so skip the call.
    if not input_text or not input_text.strip():
        return ""

    # Sampling is enabled (do_sample=True), so repeated calls with the same
    # input may produce different outputs.
    generated = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )[0]['generated_text']

    # Reduce the raw generation to the corrected Arabic text.
    return extract_corrected_version(input_text, generated)
# Gradio UI: a three-line text input and a text output wired to correct_text.
text_input = gr.Textbox(lines=3, placeholder="أدخل النص العربي هنا...")
text_output = gr.Textbox()

interface = gr.Interface(
    fn=correct_text,
    inputs=text_input,
    outputs=text_output,
    title="تصحيح النص العربي",
    description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction.",
    live=True,
)

# Start serving the interface.
interface.launch()