File size: 3,358 Bytes
227b2b4
d1b79e0
05cb4d5
d1b79e0
0c64ab6
227b2b4
a827e42
 
 
 
 
d1b79e0
0c64ab6
 
 
05cb4d5
0c64ab6
 
 
05cb4d5
0c64ab6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05cb4d5
0c64ab6
 
 
 
05cb4d5
0c64ab6
 
 
 
 
 
 
05cb4d5
d1b79e0
a827e42
 
102175e
 
 
05cb4d5
 
 
 
a827e42
102175e
05cb4d5
102175e
d1b79e0
 
0c64ab6
 
 
 
 
 
 
d1b79e0
 
0c64ab6
 
 
 
 
 
 
d1b79e0
 
0c64ab6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from transformers import pipeline, AutoTokenizer
import gradio as gr
import difflib

# Load the tokenizer and the correction pipeline from one shared checkpoint id.
MODEL_ID = "SuperSl6/Arabic-Text-Correction"

# The tokenizer is created explicitly (with use_fast=False) and handed to the
# pipeline instead of letting the pipeline build its own.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
model = pipeline(
    "text2text-generation",
    model=MODEL_ID,
    tokenizer=tokenizer,
)

def align_and_preserve(original, corrected):
    """Merge the words of *original* and *corrected* into one string.

    Both texts are split on whitespace and aligned word-by-word with
    ``difflib.SequenceMatcher``. The aligned spans are then walked in order:

    * ``equal``   -- the (identical) words are kept.
    * ``delete``  -- words present only in *original* are kept.
    * ``replace`` -- the corrected words are emitted first, followed by the
      original words they replaced.
    * ``insert``  -- words present only in *corrected* are not emitted.

    Every word is emitted at most once across the whole result: a word that
    has already been written is skipped wherever it shows up again.

    Args:
        original: The text as entered by the user.
        corrected: The candidate correction to align against *original*.

    Returns:
        The kept words joined by single spaces; an empty string when neither
        input contains any word.
    """
    original_words = original.split()
    corrected_words = corrected.split()

    matcher = difflib.SequenceMatcher(None, original_words, corrected_words)
    final_output = []
    seen_words = set()

    def keep_unseen(words):
        # Append each word the first time it is encountered, preserving order.
        for word in words:
            if word not in seen_words:
                final_output.append(word)
                seen_words.add(word)

    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'equal':
            keep_unseen(corrected_words[b0:b1])
        elif opcode == 'delete':
            keep_unseen(original_words[a0:a1])
        elif opcode == 'replace':
            keep_unseen(corrected_words[b0:b1])
            keep_unseen(original_words[a0:a1])
        # NOTE(review): 'insert' spans are skipped, exactly as before, so
        # words that exist only in `corrected` never reach the output --
        # confirm this is the intended policy.

    # The opcodes always cover `corrected_words` up to its end, so there is no
    # tail left to process here. The previous post-loop pass over
    # `corrected_words[b1:]` was therefore a no-op, and it raised
    # UnboundLocalError when both inputs were empty (no opcodes, `b1` unset).
    return ' '.join(final_output)

def extract_corrected_version(original, generated):
    """Pick the part of *generated* closest to *original* and align it.

    *generated* is cut on the literal separator ``' . '``. The piece with the
    highest ``difflib.SequenceMatcher`` ratio against *original* (the first
    one on ties) is stripped of surrounding whitespace and passed, together
    with *original*, to ``align_and_preserve``.

    Args:
        original: The text as entered by the user.
        generated: The raw text produced by the model.

    Returns:
        The string returned by ``align_and_preserve`` for the chosen piece.
    """
    def similarity(candidate):
        # Character-level similarity between the user's text and one piece.
        return difflib.SequenceMatcher(None, original, candidate).ratio()

    candidates = generated.split(' . ')
    closest = max(candidates, key=similarity)
    return align_and_preserve(original, closest.strip())

def correct_text(input_text):
    """Run the correction model on *input_text* and return the cleaned result.

    The text is sent to the module-level ``model`` pipeline with sampling
    enabled (``do_sample=True``), so repeated calls with the same input may
    return different results. The first generated sequence is then
    post-processed by ``extract_corrected_version``.

    Args:
        input_text: The text to correct, as received from the UI.

    Returns:
        The post-processed correction, or an empty string when *input_text*
        is ``None``, empty, or contains only whitespace.
    """
    # A word-less input has nothing to correct: answer immediately instead of
    # paying for a generation whose post-processing cannot keep any word.
    if not input_text or not input_text.strip():
        return ""

    result = model(
        input_text,
        max_length=50,  # NOTE(review): fixed cap on generation length -- confirm it suits long inputs
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )[0]['generated_text']

    return extract_corrected_version(input_text, result)

# Gradio Interface
# Sample sentences offered in the UI; each one is wrapped in its own
# single-item list to form one row of `examples`.
_example_sentences = (
    "اكيد ان لحكام العرب والمسلمين مسؤولية يتمثل ادناها في استدعاء السفراء في الصين للتشاور",
    "هزا النص يحتوي على الكثير من الاخطاء الاملائية",
    "هليكم السلام ورحمة الله وبركاته",
    "انشاء الله سيكون كل شيء بخير",
)
examples = [[sentence] for sentence in _example_sentences]

# Build the web UI: one multi-line text input, one text output, with
# `correct_text` as the callback and the sample rows from `examples`.
# The title, description, labels and placeholder are user-facing Arabic text.
# NOTE(review): `theme="compact"` and `allow_flagging="never"` are passed to
# Gradio as-is; whether both are still accepted depends on the installed
# Gradio version -- confirm against the pinned release.
interface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=4, placeholder="✍️ أدخل النص العربي هنا لتصحيحه...", label="📥 النص المدخل"),
    outputs=gr.Textbox(label="✅ النص المصحح"),
    title="🚀 تصحيح النص العربي باستخدام SuperSl6/Arabic-Text-Correction",
    description="📝 أداة ذكية لتصحيح النصوص العربية باستخدام تقنيات الذكاء الاصطناعي. أدخل النص وسيتم تصحيحه في الوقت الفعلي!",
    theme="compact",
    examples=examples,
    allow_flagging="never"
)

# Start the app at import/run time (there is no `__main__` guard).
interface.launch()