File size: 2,320 Bytes
227b2b4
d1b79e0
102175e
05cb4d5
d1b79e0
227b2b4
 
a827e42
 
 
 
 
d1b79e0
05cb4d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1b79e0
a827e42
 
102175e
 
 
05cb4d5
 
 
 
a827e42
102175e
05cb4d5
 
102175e
d1b79e0
 
 
 
 
 
227b2b4
d1b79e0
 
 
 
a827e42
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from transformers import pipeline, AutoTokenizer
import gradio as gr
import re
import difflib

# Model identifier shared by the tokenizer and the generation pipeline.
MODEL_ID = "SuperSl6/Arabic-Text-Correction"

# The tokenizer is loaded explicitly with use_fast=False and handed to the
# pipeline, instead of letting the pipeline pick its own tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
model = pipeline("text2text-generation", model=MODEL_ID, tokenizer=tokenizer)

def extract_corrected_version(original: str, generated: str) -> str:
    """Pick the corrected Arabic text out of the model's raw output.

    The generated text is split on the ' . ' separator, the piece most
    similar to ``original`` (by ``difflib.SequenceMatcher`` ratio) is
    selected, and the runs of characters in the Arabic Unicode block
    (U+0600 to U+06FF) found in that piece are joined with single spaces.

    Args:
        original: The text that was submitted for correction.
        generated: The raw text produced by the model.

    Returns:
        The space-joined Arabic words when that string occurs contiguously
        in the selected piece. Otherwise ``original`` is returned unchanged:
        either no Arabic characters were found, or other characters (Latin
        text, digits, ASCII punctuation) sit between the Arabic words.
    """
    # str.split always yields at least one element, so max() never receives
    # an empty sequence, even when ``generated`` is the empty string.
    candidates = generated.split(' . ')
    best_match = max(
        candidates,
        key=lambda s: difflib.SequenceMatcher(None, original, s).ratio(),
    )

    arabic_words = re.findall(r'[\u0600-\u06FF]+', best_match)
    if not arabic_words:
        return original

    corrected_text = ' '.join(arabic_words)
    # A single containment test is sufficient: once the full phrase is known
    # to be a substring of the best match, no shorter prefix has to be tried,
    # and whether it equals the whole (stripped) match does not change the
    # value that is returned.
    if corrected_text in best_match:
        return corrected_text

    # The Arabic words are not contiguous in the output; fall back to the
    # caller's text rather than return a stitched-together phrase.
    return original

def correct_text(input_text: str) -> str:
    """Run the correction model on ``input_text`` and return the cleaned result.

    Args:
        input_text: Text taken from the UI textbox.

    Returns:
        The result of ``extract_corrected_version`` applied to the model's
        generated text. Empty or whitespace-only input is returned as is
        (``None`` becomes ``""``) without invoking the model.
    """
    # The Gradio interface is live, so this function can be invoked with an
    # empty textbox. Skip generation in that case: there is nothing to
    # correct, and sampling from an empty prompt would only produce noise.
    if not input_text or not input_text.strip():
        return input_text or ""

    # do_sample=True means decoding is stochastic, so repeated calls with the
    # same input may yield different outputs.
    outputs = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    generated = outputs[0]['generated_text']

    # Reduce the raw generation to the Arabic text closest to the input.
    return extract_corrected_version(input_text, generated)

# Gradio UI: a three-line input textbox wired to correct_text and a single
# output textbox. live=True is passed so the UI does not wait for an explicit
# submit before calling the function.
input_box = gr.Textbox(lines=3, placeholder="أدخل النص العربي هنا...")
output_box = gr.Textbox()

interface = gr.Interface(
    title="تصحيح النص العربي",
    description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction.",
    fn=correct_text,
    inputs=input_box,
    outputs=output_box,
    live=True,
)

# Start the web server for the interface.
interface.launch()