Spaces:
Running
Running
File size: 3,358 Bytes
227b2b4 d1b79e0 05cb4d5 d1b79e0 0c64ab6 227b2b4 a827e42 d1b79e0 0c64ab6 05cb4d5 0c64ab6 05cb4d5 0c64ab6 05cb4d5 0c64ab6 05cb4d5 0c64ab6 05cb4d5 d1b79e0 a827e42 102175e 05cb4d5 a827e42 102175e 05cb4d5 102175e d1b79e0 0c64ab6 d1b79e0 0c64ab6 d1b79e0 0c64ab6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
from transformers import pipeline, AutoTokenizer
import gradio as gr
import difflib
# Checkpoint id shared by the tokenizer and the generation pipeline.
MODEL_ID = "SuperSl6/Arabic-Text-Correction"

# The tokenizer is loaded explicitly with use_fast=False and then handed to
# the pipeline, so the pipeline does not pick a tokenizer on its own.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Text-to-text generation pipeline; called by correct_text() below.
model = pipeline(task="text2text-generation", model=MODEL_ID, tokenizer=tokenizer)
def align_and_preserve(original, corrected):
    """Merge *corrected* with *original* word by word, dropping repeated words.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. The aligned regions are then walked in order:

    * ``equal``   -- the shared words are emitted.
    * ``delete``  -- words present only in *original* are emitted, i.e. a
      deletion made by the corrector is undone.
    * ``replace`` -- the corrected words are emitted first, followed by the
      original words they replaced.
    * ``insert``  -- words present only in *corrected* are not emitted.

    A word is emitted at most once across the whole result: any later
    occurrence of an already-emitted word is skipped.

    Args:
        original: The text as typed by the user.
        corrected: The candidate correction to align against *original*.

    Returns:
        The emitted words joined by single spaces. Returns ``""`` when
        nothing is emitted, including when both inputs contain no words.
    """
    original_words = original.split()
    corrected_words = corrected.split()
    matcher = difflib.SequenceMatcher(None, original_words, corrected_words)

    final_output = []
    seen_words = set()

    def emit(words):
        # Append each word the first time it is seen; skip repeats.
        for word in words:
            if word not in seen_words:
                seen_words.add(word)
                final_output.append(word)

    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'equal':
            emit(corrected_words[b0:b1])
        elif opcode == 'delete':
            emit(original_words[a0:a1])
        elif opcode == 'replace':
            emit(corrected_words[b0:b1])
            emit(original_words[a0:a1])
        # 'insert' has no branch: corrected-only words are left out.

    # No extra pass over corrected_words is needed after the loop: the opcodes
    # always cover both sequences to their ends. Reading the loop variable b1
    # here would also fail when there are no opcodes at all (both inputs
    # empty), because it would never have been bound.
    return ' '.join(final_output)
def extract_corrected_version(original, generated):
    """Pick the part of *generated* closest to *original* and align it.

    The generated text is split on the separator ``' . '`` (space, period,
    space). Each piece is scored against *original* with
    ``difflib.SequenceMatcher(...).ratio()`` on the raw strings, and the
    highest-scoring piece (the first one on ties) is stripped of surrounding
    whitespace and passed to ``align_and_preserve`` together with *original*.

    Args:
        original: The user's input text.
        generated: The raw text produced by the model.

    Returns:
        The string returned by ``align_and_preserve`` for the chosen piece.
    """
    def similarity(candidate):
        # Similarity of one candidate piece to the user's text.
        return difflib.SequenceMatcher(None, original, candidate).ratio()

    candidates = generated.split(' . ')
    closest = max(candidates, key=similarity)
    return align_and_preserve(original, closest.strip())
def correct_text(input_text):
    """Correct *input_text* with the model and post-process the output.

    The text is sent to the module-level ``model`` pipeline and the first
    returned sequence's ``'generated_text'`` is passed, together with the
    input, to ``extract_corrected_version``.

    Args:
        input_text: Text entered by the user. ``None``, empty, or
            whitespace-only input is treated as "nothing to correct".

    Returns:
        The post-processed correction, or ``""`` for blank input.
    """
    # With no words in the input, the word-alignment step has nothing to
    # anchor on and can only yield an empty result, so skip the model call.
    if not input_text or not input_text.strip():
        return ""

    # do_sample=True with temperature/top_p means the output is sampled, so
    # the same input may give different results on different calls.
    # max_length=50 bounds the length of the generated sequence.
    result = model(
        input_text,
        max_length=50,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )[0]['generated_text']

    return extract_corrected_version(input_text, result)
# Gradio Interface

# Sample inputs offered in the UI; each inner list holds the value for the
# single input textbox.
examples = [
    ["اكيد ان لحكام العرب والمسلمين مسؤولية يتمثل ادناها في استدعاء السفراء في الصين للتشاور"],
    ["هزا النص يحتوي على الكثير من الاخطاء الاملائية"],
    ["هليكم السلام ورحمة الله وبركاته"],
    ["انشاء الله سيكون كل شيء بخير"],
]

# One text box in, one text box out, wired to correct_text().
interface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=4, placeholder="✍️ أدخل النص العربي هنا لتصحيحه...", label="📥 النص المدخل"),
    outputs=gr.Textbox(label="✅ النص المصحح"),
    title="🚀 تصحيح النص العربي باستخدام SuperSl6/Arabic-Text-Correction",
    description="📝 أداة ذكية لتصحيح النصوص العربية باستخدام تقنيات الذكاء الاصطناعي. أدخل النص وسيتم تصحيحه في الوقت الفعلي!",
    theme="compact",
    examples=examples,
    allow_flagging="never",
)

# Start the web app when this file is executed.
interface.launch()