from transformers import pipeline, AutoTokenizer
import gradio as gr
import re
import difflib

# Request the non-fast ("slow") tokenizer implementation explicitly, then
# hand that tokenizer to the text2text-generation pipeline so both come from
# the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "SuperSl6/Arabic-Text-Correction",
    use_fast=False,
)
model = pipeline(
    task="text2text-generation",
    model="SuperSl6/Arabic-Text-Correction",
    tokenizer=tokenizer,
)

def extract_corrected_version(original: str, generated: str) -> str:
    """Pick the corrected Arabic text out of the model's raw output.

    The generated text is split on ``' . '`` into candidate sentences and the
    candidate most similar to ``original`` (highest
    ``difflib.SequenceMatcher`` ratio, first one wins on ties) is kept.  The
    runs of characters from the Arabic Unicode block (U+0600-U+06FF) found in
    that candidate are then joined with single spaces.

    Args:
        original: The text the user submitted.
        generated: The raw text produced by the model.

    Returns:
        The space-joined Arabic words when they occur as one contiguous
        substring of the best-matching candidate.  Otherwise ``original`` is
        returned unchanged: either the candidate holds no Arabic characters,
        or its Arabic words are separated by something other than a single
        space (for example Latin punctuation or digits).
    """
    candidates = generated.split(' . ')

    # Keep the candidate sentence that looks most like what the user typed.
    best_match = max(
        candidates,
        key=lambda sentence: difflib.SequenceMatcher(None, original, sentence).ratio(),
    )

    arabic_words = re.findall(r'[\u0600-\u06FF]+', best_match)
    if not arabic_words:
        return original

    corrected_text = ' '.join(arabic_words)

    # Only trust the joined words when they are a verbatim excerpt of the
    # candidate; surrounding non-Arabic characters (e.g. a trailing '.') are
    # dropped by this step.  If the words are not contiguous in the candidate,
    # fall back to the user's input rather than returning a stitched phrase.
    if corrected_text in best_match:
        return corrected_text
    return original

def correct_text(input_text) -> str:
    """Run the correction model on ``input_text`` and return the cleaned result.

    Args:
        input_text: Text typed by the user.  ``None``, an empty string, or a
            whitespace-only string is treated as "nothing to correct".

    Returns:
        The corrected text extracted from the model output by
        ``extract_corrected_version``.  For blank input the model is not
        called and the input itself is returned (``""`` for ``None``).
    """
    # The UI calls this on every change, including when the box is cleared;
    # feeding a blank prompt to a sampling generator would only yield made-up
    # text, so short-circuit instead.
    if input_text is None or not input_text.strip():
        return input_text or ""

    # NOTE: do_sample=True means the output is not deterministic for a given
    # input; max_length=50 bounds how long the generated sequence can be.
    outputs = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    generated = outputs[0]['generated_text']

    # Reduce the raw generation to the sentence that best matches the input.
    return extract_corrected_version(input_text, generated)

# Gradio UI: one multi-line textbox in, one textbox out, wired to correct_text.
# live=True asks Gradio to re-run the function as the input changes instead of
# waiting for a submit action.
interface = gr.Interface(
    title="تصحيح النص العربي",
    description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction.",
    fn=correct_text,
    inputs=gr.Textbox(
        lines=3,
        placeholder="أدخل النص العربي هنا...",
    ),
    outputs=gr.Textbox(),
    live=True,
)

# Start the web server for the interface.
interface.launch()